/*
  Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com>
  This file is part of GlusterFS.

  GlusterFS is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published
  by the Free Software Foundation; either version 3 of the License,
  or (at your option) any later version.

  GlusterFS is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see
  <http://www.gnu.org/licenses/>.
*/

#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif

#include <inttypes.h>

#include "md5.h"
#include "call-stub.h"


static call_stub_t *
stub_new (call_frame_t *frame,
	  char wind,
	  glusterfs_fop_t fop)
{
	call_stub_t *new = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	new = CALLOC (1, sizeof (*new));
	GF_VALIDATE_OR_GOTO ("call-stub", new, out);

	new->frame = frame;
	new->wind = wind;
	new->fop = fop;

	INIT_LIST_HEAD (&new->list);
out:
	return new;
}
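
/* Every fop has a pair of constructors: fop_XXX_stub() captures the
 * arguments of a call on its way down the translator stack, and
 * fop_XXX_cbk_stub() captures the results on the way back up.
 * Reference-counted objects (inodes, fds, dicts, iobrefs) are ref'd
 * and strings duplicated so the stub remains valid after the caller's
 * copies are released. */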


call_stub_t *
fop_lookup_stub (call_frame_t *frame,
		 fop_lookup_t fn,
		 loc_t *loc,
		 dict_t *xattr_req)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_LOOKUP);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.lookup.fn = fn;

	if (xattr_req)
		stub->args.lookup.xattr_req = dict_ref (xattr_req);

	loc_copy (&stub->args.lookup.loc, loc);
out:
	return stub;
}


call_stub_t *
fop_lookup_cbk_stub (call_frame_t *frame,
		     fop_lookup_cbk_t fn,
		     int32_t op_ret,
		     int32_t op_errno,
		     inode_t *inode,
		     struct stat *buf,
                     dict_t *dict,
                     struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_LOOKUP);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.lookup_cbk.fn = fn;
	stub->args.lookup_cbk.op_ret = op_ret;
	stub->args.lookup_cbk.op_errno = op_errno;
	if (inode)
		stub->args.lookup_cbk.inode = inode_ref (inode);
	if (buf)
		stub->args.lookup_cbk.buf = *buf;
	if (dict)
		stub->args.lookup_cbk.dict = dict_ref (dict);
        if (postparent)
                stub->args.lookup_cbk.postparent = *postparent;
out:
	return stub;
}



call_stub_t *
fop_stat_stub (call_frame_t *frame,
	       fop_stat_t fn,
	       loc_t *loc)
{
	call_stub_t *stub = NULL;
  
	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_STAT);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.stat.fn = fn;
	loc_copy (&stub->args.stat.loc, loc);
out:
	return stub;
}


call_stub_t *
fop_stat_cbk_stub (call_frame_t *frame,
		   fop_stat_cbk_t fn,
		   int32_t op_ret,
		   int32_t op_errno,
		   struct stat *buf)
{
	call_stub_t *stub = NULL;
	
	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_STAT);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.stat_cbk.fn = fn;
	stub->args.stat_cbk.op_ret = op_ret;
	stub->args.stat_cbk.op_errno = op_errno;
	if (op_ret == 0)
		stub->args.stat_cbk.buf = *buf;
out:
	return stub;
}


call_stub_t *
fop_fstat_stub (call_frame_t *frame,
		fop_fstat_t fn,
		fd_t *fd)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_FSTAT);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fstat.fn = fn;

	if (fd)
		stub->args.fstat.fd = fd_ref (fd);
out:
	return stub;
}


call_stub_t *
fop_fstat_cbk_stub (call_frame_t *frame,
		    fop_fstat_cbk_t fn,
		    int32_t op_ret,
		    int32_t op_errno,
		    struct stat *buf)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FSTAT);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fstat_cbk.fn = fn;
	stub->args.fstat_cbk.op_ret = op_ret;
	stub->args.fstat_cbk.op_errno = op_errno;
	if (buf)
		stub->args.fstat_cbk.buf = *buf;
out:
	return stub;
}


/* truncate */

call_stub_t *
fop_truncate_stub (call_frame_t *frame,
		   fop_truncate_t fn,
		   loc_t *loc,
		   off_t off)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);	
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_TRUNCATE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.truncate.fn = fn;
	loc_copy (&stub->args.truncate.loc, loc);
	stub->args.truncate.off = off;
out:
	return stub;
}


call_stub_t *
fop_truncate_cbk_stub (call_frame_t *frame,
		       fop_truncate_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno,
		       struct stat *prebuf,
                       struct stat *postbuf)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_TRUNCATE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.truncate_cbk.fn = fn;
	stub->args.truncate_cbk.op_ret = op_ret;
	stub->args.truncate_cbk.op_errno = op_errno;
	if (prebuf)
		stub->args.truncate_cbk.prebuf = *prebuf;
        if (postbuf)
                stub->args.truncate_cbk.postbuf = *postbuf;
out:
	return stub;
}


call_stub_t *
fop_ftruncate_stub (call_frame_t *frame,
		    fop_ftruncate_t fn,
		    fd_t *fd,
		    off_t off)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_FTRUNCATE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.ftruncate.fn = fn;
	if (fd)
		stub->args.ftruncate.fd = fd_ref (fd);

	stub->args.ftruncate.off = off;
out:
	return stub;
}


call_stub_t *
fop_ftruncate_cbk_stub (call_frame_t *frame,
			fop_ftruncate_cbk_t fn,
			int32_t op_ret,
			int32_t op_errno,
			struct stat *prebuf,
                        struct stat *postbuf)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FTRUNCATE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.ftruncate_cbk.fn = fn;
	stub->args.ftruncate_cbk.op_ret = op_ret;
	stub->args.ftruncate_cbk.op_errno = op_errno;
	if (prebuf)
		stub->args.ftruncate_cbk.prebuf = *prebuf;
	if (postbuf)
		stub->args.ftruncate_cbk.postbuf = *postbuf;
out:
	return stub;
}


call_stub_t *
fop_access_stub (call_frame_t *frame,
		 fop_access_t fn,
		 loc_t *loc,
		 int32_t mask)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_ACCESS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.access.fn = fn;
	loc_copy (&stub->args.access.loc, loc);
	stub->args.access.mask = mask;
out:
	return stub;
}


call_stub_t *
fop_access_cbk_stub (call_frame_t *frame,
		     fop_access_cbk_t fn,
		     int32_t op_ret,
		     int32_t op_errno)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_ACCESS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.access_cbk.fn = fn;
	stub->args.access_cbk.op_ret = op_ret;
	stub->args.access_cbk.op_errno = op_errno;
out:
	return stub;
}


call_stub_t *
fop_readlink_stub (call_frame_t *frame,
		   fop_readlink_t fn,
		   loc_t *loc,
		   size_t size)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
	
	stub = stub_new (frame, 1, GF_FOP_READLINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.readlink.fn = fn;
	loc_copy (&stub->args.readlink.loc, loc);
	stub->args.readlink.size = size;
out:
	return stub;
}


call_stub_t *
fop_readlink_cbk_stub (call_frame_t *frame,
		       fop_readlink_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno,
		       const char *path,
                       struct stat *sbuf)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_READLINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.readlink_cbk.fn = fn;
	stub->args.readlink_cbk.op_ret = op_ret;
	stub->args.readlink_cbk.op_errno = op_errno;
	if (path)
		stub->args.readlink_cbk.buf = strdup (path);
        if (sbuf)
                stub->args.readlink_cbk.sbuf = *sbuf;
out:
	return stub;
}


call_stub_t *
fop_mknod_stub (call_frame_t *frame,
		fop_mknod_t fn,
		loc_t *loc,
		mode_t mode,
		dev_t rdev)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_MKNOD);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.mknod.fn = fn;
	loc_copy (&stub->args.mknod.loc, loc);
	stub->args.mknod.mode = mode;
	stub->args.mknod.rdev = rdev;
out:
	return stub;
}


call_stub_t *
fop_mknod_cbk_stub (call_frame_t *frame,
		    fop_mknod_cbk_t fn,
		    int32_t op_ret,
		    int32_t op_errno,
		    inode_t *inode,
                    struct stat *buf,
                    struct stat *preparent,
                    struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_MKNOD);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.mknod_cbk.fn = fn;
	stub->args.mknod_cbk.op_ret = op_ret;
	stub->args.mknod_cbk.op_errno = op_errno;
	if (inode)
		stub->args.mknod_cbk.inode = inode_ref (inode);
	if (buf)
		stub->args.mknod_cbk.buf = *buf;
        if (preparent)
                stub->args.mknod_cbk.preparent = *preparent;
        if (postparent)
                stub->args.mknod_cbk.postparent = *postparent;
out:
	return stub;
}


call_stub_t *
fop_mkdir_stub (call_frame_t *frame,
		fop_mkdir_t fn,
		loc_t *loc,
		mode_t mode)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_MKDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.mkdir.fn = fn;
	loc_copy (&stub->args.mkdir.loc, loc);
	stub->args.mkdir.mode = mode;
out:
	return stub;
}


call_stub_t *
fop_mkdir_cbk_stub (call_frame_t *frame,
		    fop_mkdir_cbk_t fn,
		    int32_t op_ret,
		    int32_t op_errno,
		    inode_t *inode,
                    struct stat *buf,
                    struct stat *preparent,
                    struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_MKDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.mkdir_cbk.fn = fn;
	stub->args.mkdir_cbk.op_ret = op_ret;
	stub->args.mkdir_cbk.op_errno = op_errno;
	if (inode)
		stub->args.mkdir_cbk.inode = inode_ref (inode);
	if (buf)
		stub->args.mkdir_cbk.buf = *buf;
        if (preparent)
                stub->args.mkdir_cbk.preparent = *preparent;
        if (postparent)
                stub->args.mkdir_cbk.postparent = *postparent;
out:
	return stub;
}


call_stub_t *
fop_unlink_stub (call_frame_t *frame,
		 fop_unlink_t fn,
		 loc_t *loc)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_UNLINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.unlink.fn = fn;
	loc_copy (&stub->args.unlink.loc, loc);
out:
	return stub;
}


call_stub_t *
fop_unlink_cbk_stub (call_frame_t *frame,
		     fop_unlink_cbk_t fn,
		     int32_t op_ret,
		     int32_t op_errno,
                     struct stat *preparent,
                     struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_UNLINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.unlink_cbk.fn = fn;
	stub->args.unlink_cbk.op_ret = op_ret;
	stub->args.unlink_cbk.op_errno = op_errno;
        if (preparent)
                stub->args.unlink_cbk.preparent = *preparent;
        if (postparent)
                stub->args.unlink_cbk.postparent = *postparent;
out:
	return stub;
}



call_stub_t *
fop_rmdir_stub (call_frame_t *frame,
		fop_rmdir_t fn,
		loc_t *loc)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_RMDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.rmdir.fn = fn;
	loc_copy (&stub->args.rmdir.loc, loc);
out:
	return stub;
}


call_stub_t *
fop_rmdir_cbk_stub (call_frame_t *frame,
		    fop_rmdir_cbk_t fn,
		    int32_t op_ret,
		    int32_t op_errno,
                    struct stat *preparent,
                    struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_RMDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.rmdir_cbk.fn = fn;
	stub->args.rmdir_cbk.op_ret = op_ret;
	stub->args.rmdir_cbk.op_errno = op_errno;
        if (preparent)
                stub->args.rmdir_cbk.preparent = *preparent;
        if (postparent)
                stub->args.rmdir_cbk.postparent = *postparent;
out:
	return stub;
}


call_stub_t *
fop_symlink_stub (call_frame_t *frame,
		  fop_symlink_t fn,
		  const char *linkname,
		  loc_t *loc)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
	GF_VALIDATE_OR_GOTO ("call-stub", linkname, out);

	stub = stub_new (frame, 1, GF_FOP_SYMLINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.symlink.fn = fn;
	stub->args.symlink.linkname = strdup (linkname);
	loc_copy (&stub->args.symlink.loc, loc);
out:
	return stub;
}


call_stub_t *
fop_symlink_cbk_stub (call_frame_t *frame,
		      fop_symlink_cbk_t fn,
		      int32_t op_ret,
		      int32_t op_errno,
		      inode_t *inode,
                      struct stat *buf,
                      struct stat *preparent,
                      struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_SYMLINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.symlink_cbk.fn = fn;
	stub->args.symlink_cbk.op_ret = op_ret;
	stub->args.symlink_cbk.op_errno = op_errno;
	if (inode)
		stub->args.symlink_cbk.inode = inode_ref (inode);
	if (buf)
		stub->args.symlink_cbk.buf = *buf;
        if (preparent)
                stub->args.symlink_cbk.preparent = *preparent;
        if (postparent)
                stub->args.symlink_cbk.postparent = *postparent;
out:
	return stub;
}


call_stub_t *
fop_rename_stub (call_frame_t *frame,
		 fop_rename_t fn,
		 loc_t *oldloc,
		 loc_t *newloc)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", oldloc, out);
	GF_VALIDATE_OR_GOTO ("call-stub", newloc, out);

	stub = stub_new (frame, 1, GF_FOP_RENAME);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.rename.fn = fn;
	loc_copy (&stub->args.rename.old, oldloc);
	loc_copy (&stub->args.rename.new, newloc);
out:
	return stub;
}


call_stub_t *
fop_rename_cbk_stub (call_frame_t *frame,
		     fop_rename_cbk_t fn,
		     int32_t op_ret,
		     int32_t op_errno,
		     struct stat *buf,
                     struct stat *preoldparent,
                     struct stat *postoldparent,
                     struct stat *prenewparent,
                     struct stat *postnewparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_RENAME);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.rename_cbk.fn = fn;
	stub->args.rename_cbk.op_ret = op_ret;
	stub->args.rename_cbk.op_errno = op_errno;
	if (buf)
		stub->args.rename_cbk.buf = *buf;
        if (preoldparent)
                stub->args.rename_cbk.preoldparent = *preoldparent;
        if (postoldparent)
                stub->args.rename_cbk.postoldparent = *postoldparent;
        if (prenewparent)
                stub->args.rename_cbk.prenewparent = *prenewparent;
        if (postnewparent)
                stub->args.rename_cbk.postnewparent = *postnewparent;
out:
	return stub;
}


call_stub_t *
fop_link_stub (call_frame_t *frame,
	       fop_link_t fn,
	       loc_t *oldloc,
	       loc_t *newloc)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", oldloc, out);
	GF_VALIDATE_OR_GOTO ("call-stub", newloc, out);

	stub = stub_new (frame, 1, GF_FOP_LINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.link.fn = fn;
	loc_copy (&stub->args.link.oldloc, oldloc);
	loc_copy (&stub->args.link.newloc, newloc);

out:
	return stub;
}


call_stub_t *
fop_link_cbk_stub (call_frame_t *frame,
		   fop_link_cbk_t fn,
		   int32_t op_ret,
		   int32_t op_errno,
		   inode_t *inode,
                   struct stat *buf,
                   struct stat *preparent,
                   struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_LINK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.link_cbk.fn = fn;
	stub->args.link_cbk.op_ret = op_ret;
	stub->args.link_cbk.op_errno = op_errno;
	if (inode)
		stub->args.link_cbk.inode = inode_ref (inode);
	if (buf)
		stub->args.link_cbk.buf = *buf;
        if (preparent)
                stub->args.link_cbk.preparent = *preparent;
        if (postparent)
                stub->args.link_cbk.postparent = *postparent;
out:
	return stub;
}


call_stub_t *
fop_create_stub (call_frame_t *frame,
		 fop_create_t fn,
		 loc_t *loc,
		 int32_t flags,
		 mode_t mode, fd_t *fd)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_CREATE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.create.fn = fn;
	loc_copy (&stub->args.create.loc, loc);
	stub->args.create.flags = flags;
	stub->args.create.mode = mode;
	if (fd)
		stub->args.create.fd = fd_ref (fd);
out:
	return stub;
}


call_stub_t *
fop_create_cbk_stub (call_frame_t *frame,
		     fop_create_cbk_t fn,
		     int32_t op_ret,
		     int32_t op_errno,
		     fd_t *fd,
		     inode_t *inode,
		     struct stat *buf,
                     struct stat *preparent,
                     struct stat *postparent)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_CREATE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.create_cbk.fn = fn;
	stub->args.create_cbk.op_ret = op_ret;
	stub->args.create_cbk.op_errno = op_errno;
	if (fd)
		stub->args.create_cbk.fd = fd_ref (fd);
	if (inode)
		stub->args.create_cbk.inode = inode_ref (inode);
	if (buf)
		stub->args.create_cbk.buf = *buf;
        if (preparent)
                stub->args.create_cbk.preparent = *preparent;
        if (postparent)
                stub->args.create_cbk.postparent = *postparent;
out:
	return stub;
}


call_stub_t *
fop_open_stub (call_frame_t *frame,
	       fop_open_t fn,
	       loc_t *loc,
	       int32_t flags, fd_t *fd,
               int32_t wbflags)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_OPEN);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.open.fn = fn;
	loc_copy (&stub->args.open.loc, loc);
	stub->args.open.flags = flags;
        stub->args.open.wbflags = wbflags;
	if (fd)
		stub->args.open.fd = fd_ref (fd);
out:
	return stub;
}


call_stub_t *
fop_open_cbk_stub (call_frame_t *frame,
		   fop_open_cbk_t fn,
		   int32_t op_ret,
		   int32_t op_errno,
		   fd_t *fd)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_OPEN);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.open_cbk.fn = fn;
	stub->args.open_cbk.op_ret = op_ret;
	stub->args.open_cbk.op_errno = op_errno;
	if (fd)
		stub->args.open_cbk.fd = fd_ref (fd);
out:
	return stub;
}


call_stub_t *
fop_readv_stub (call_frame_t *frame,
		fop_readv_t fn,
		fd_t *fd,
		size_t size,
		off_t off)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_READ);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.readv.fn = fn;
	if (fd)
		stub->args.readv.fd = fd_ref (fd);
	stub->args.readv.size = size;
	stub->args.readv.off = off;
out:
	return stub;
}


call_stub_t *
fop_readv_cbk_stub (call_frame_t *frame,
		    fop_readv_cbk_t fn,
		    int32_t op_ret,
		    int32_t op_errno,
		    struct iovec *vector,
		    int32_t count,
		    struct stat *stbuf,
                    struct iobref *iobref)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_READ);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.readv_cbk.fn = fn;
	stub->args.readv_cbk.op_ret = op_ret;
	stub->args.readv_cbk.op_errno = op_errno;
	if (op_ret >= 0) {
		stub->args.readv_cbk.vector = iov_dup (vector, count);
		stub->args.readv_cbk.count = count;
		stub->args.readv_cbk.stbuf = *stbuf;
		stub->args.readv_cbk.iobref = iobref_ref (iobref);
	}
out:
	return stub;
}


call_stub_t *
fop_writev_stub (call_frame_t *frame,
		 fop_writev_t fn,
		 fd_t *fd,
		 struct iovec *vector,
		 int32_t count,
		 off_t off,
                 struct iobref *iobref)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", vector, out);

	stub = stub_new (frame, 1, GF_FOP_WRITE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.writev.fn = fn;
	if (fd)
		stub->args.writev.fd = fd_ref (fd);
	stub->args.writev.vector = iov_dup (vector, count);
	stub->args.writev.count = count;
	stub->args.writev.off = off;
        stub->args.writev.iobref = iobref_ref (iobref);
out:
	return stub;
}


call_stub_t *
fop_writev_cbk_stub (call_frame_t *frame,
		     fop_writev_cbk_t fn,
		     int32_t op_ret,
		     int32_t op_errno,
                     struct stat *prebuf,
		     struct stat *postbuf)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_WRITE);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.writev_cbk.fn = fn;
	stub->args.writev_cbk.op_ret = op_ret;
	stub->args.writev_cbk.op_errno = op_errno;
	if (op_ret >= 0)
		stub->args.writev_cbk.postbuf = *postbuf;
        if (prebuf)
                stub->args.writev_cbk.prebuf = *prebuf;
out:
	return stub;
}



call_stub_t *
fop_flush_stub (call_frame_t *frame,
		fop_flush_t fn,
		fd_t *fd)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_FLUSH);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.flush.fn = fn;
	if (fd)
		stub->args.flush.fd = fd_ref (fd);
out:
	return stub;
}


call_stub_t *
fop_flush_cbk_stub (call_frame_t *frame,
		    fop_flush_cbk_t fn,
		    int32_t op_ret,
		    int32_t op_errno)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FLUSH);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.flush_cbk.fn = fn;
	stub->args.flush_cbk.op_ret = op_ret;
	stub->args.flush_cbk.op_errno = op_errno;
out:
	return stub;
}




call_stub_t *
fop_fsync_stub (call_frame_t *frame,
		fop_fsync_t fn,
		fd_t *fd,
		int32_t datasync)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_FSYNC);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fsync.fn = fn;
	if (fd)
		stub->args.fsync.fd = fd_ref (fd);
	stub->args.fsync.datasync = datasync;
out:
	return stub;
}


call_stub_t *
fop_fsync_cbk_stub (call_frame_t *frame,
		    fop_fsync_cbk_t fn,
		    int32_t op_ret,
		    int32_t op_errno,
                    struct stat *prebuf,
                    struct stat *postbuf)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FSYNC);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fsync_cbk.fn = fn;
	stub->args.fsync_cbk.op_ret = op_ret;
	stub->args.fsync_cbk.op_errno = op_errno;
        if (prebuf)
                stub->args.fsync_cbk.prebuf = *prebuf;
        if (postbuf)
                stub->args.fsync_cbk.postbuf = *postbuf;
out:
	return stub;
}


call_stub_t *
fop_opendir_stub (call_frame_t *frame,
		  fop_opendir_t fn,
		  loc_t *loc, fd_t *fd)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_OPENDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.opendir.fn = fn;
	loc_copy (&stub->args.opendir.loc, loc);
	if (fd)
		stub->args.opendir.fd = fd_ref (fd);
out:
	return stub;
}


call_stub_t *
fop_opendir_cbk_stub (call_frame_t *frame,
		      fop_opendir_cbk_t fn,
		      int32_t op_ret,
		      int32_t op_errno,
		      fd_t *fd)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_OPENDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.opendir_cbk.fn = fn;
	stub->args.opendir_cbk.op_ret = op_ret;
	stub->args.opendir_cbk.op_errno = op_errno;

	if (fd)
		stub->args.opendir_cbk.fd = fd_ref (fd);
out:
	return stub;
}


call_stub_t *
fop_getdents_stub (call_frame_t *frame,
		   fop_getdents_t fn,
		   fd_t *fd,
		   size_t size,
		   off_t off,
		   int32_t flag)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_GETDENTS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.getdents.fn = fn;
	stub->args.getdents.size = size;
	stub->args.getdents.off = off;
	if (fd)
		stub->args.getdents.fd = fd_ref (fd);
	stub->args.getdents.flag = flag;
out:
	return stub;
}


call_stub_t *
fop_getdents_cbk_stub (call_frame_t *frame,
		       fop_getdents_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno,
		       dir_entry_t *entries,
		       int32_t count)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_GETDENTS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.getdents_cbk.fn = fn;
	stub->args.getdents_cbk.op_ret = op_ret;
	stub->args.getdents_cbk.op_errno = op_errno;
	if (op_ret >= 0) {
		stub->args.getdents_cbk.entries.next = entries->next;
		/* FIXME: are entries not needed in the caller after
		 * creating stub? */
		entries->next = NULL;
	}

	stub->args.getdents_cbk.count = count;
out:
	return stub;
}



call_stub_t *
fop_fsyncdir_stub (call_frame_t *frame,
		   fop_fsyncdir_t fn,
		   fd_t *fd,
		   int32_t datasync)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_FSYNCDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fsyncdir.fn = fn;
	if (fd)
		stub->args.fsyncdir.fd = fd_ref (fd);
	stub->args.fsyncdir.datasync = datasync;
out:
	return stub;
}


call_stub_t *
fop_fsyncdir_cbk_stub (call_frame_t *frame,
		       fop_fsyncdir_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FSYNCDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fsyncdir_cbk.fn = fn;
	stub->args.fsyncdir_cbk.op_ret = op_ret;
	stub->args.fsyncdir_cbk.op_errno = op_errno;
out:
	return stub;
}


call_stub_t *
fop_statfs_stub (call_frame_t *frame,
		 fop_statfs_t fn,
		 loc_t *loc)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out); 

	stub = stub_new (frame, 1, GF_FOP_STATFS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.statfs.fn = fn;
	loc_copy (&stub->args.statfs.loc, loc);
out:
	return stub;
}


call_stub_t *
fop_statfs_cbk_stub (call_frame_t *frame,
		     fop_statfs_cbk_t fn,
		     int32_t op_ret,
		     int32_t op_errno,
		     struct statvfs *buf)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_STATFS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.statfs_cbk.fn = fn;
	stub->args.statfs_cbk.op_ret = op_ret;
	stub->args.statfs_cbk.op_errno = op_errno;
	if (op_ret == 0)
		stub->args.statfs_cbk.buf = *buf;
out:
	return stub;
}


call_stub_t *
fop_setxattr_stub (call_frame_t *frame,
		   fop_setxattr_t fn,
		   loc_t *loc,
		   dict_t *dict,
		   int32_t flags)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_SETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.setxattr.fn = fn;
	loc_copy (&stub->args.setxattr.loc, loc);
	/* TODO */
	if (dict)
		stub->args.setxattr.dict = dict_ref (dict);
	stub->args.setxattr.flags = flags;
out:
	return stub;
}


call_stub_t *
fop_setxattr_cbk_stub (call_frame_t *frame,
		       fop_setxattr_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_SETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.setxattr_cbk.fn = fn;
	stub->args.setxattr_cbk.op_ret = op_ret;
	stub->args.setxattr_cbk.op_errno = op_errno;
out:
	return stub;
}

call_stub_t *
fop_getxattr_stub (call_frame_t *frame,
		   fop_getxattr_t fn,
		   loc_t *loc,
		   const char *name)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_GETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.getxattr.fn = fn;
	loc_copy (&stub->args.getxattr.loc, loc);

	if (name)
	        stub->args.getxattr.name = strdup (name);
out:
	return stub;
}


call_stub_t *
fop_getxattr_cbk_stub (call_frame_t *frame,
		       fop_getxattr_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno,
		       dict_t *dict)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_GETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.getxattr_cbk.fn = fn;
	stub->args.getxattr_cbk.op_ret = op_ret;
	stub->args.getxattr_cbk.op_errno = op_errno;
	/* TODO */
	if (dict)
		stub->args.getxattr_cbk.dict = dict_ref (dict);
out:
	return stub;
}


call_stub_t *
fop_fsetxattr_stub (call_frame_t *frame,
                    fop_fsetxattr_t fn,
                    fd_t *fd,
                    dict_t *dict,
                    int32_t flags)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", fd, out);

	stub = stub_new (frame, 1, GF_FOP_FSETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fsetxattr.fn = fn;
	stub->args.fsetxattr.fd = fd_ref (fd);

	/* TODO */
	if (dict)
		stub->args.fsetxattr.dict = dict_ref (dict);
	stub->args.fsetxattr.flags = flags;
out:
	return stub;
}


call_stub_t *
fop_fsetxattr_cbk_stub (call_frame_t *frame,
                        fop_fsetxattr_cbk_t fn,
                        int32_t op_ret,
                        int32_t op_errno)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FSETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fsetxattr_cbk.fn = fn;
	stub->args.fsetxattr_cbk.op_ret = op_ret;
	stub->args.fsetxattr_cbk.op_errno = op_errno;
out:
	return stub;
}


call_stub_t *
fop_fgetxattr_stub (call_frame_t *frame,
                    fop_fgetxattr_t fn,
                    fd_t *fd,
                    const char *name)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", fd, out);

	stub = stub_new (frame, 1, GF_FOP_FGETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fgetxattr.fn = fn;
	stub->args.fgetxattr.fd = fd_ref (fd);

	if (name)
	        stub->args.fgetxattr.name = strdup (name);
out:
	return stub;
}


call_stub_t *
fop_fgetxattr_cbk_stub (call_frame_t *frame,
                        fop_fgetxattr_cbk_t fn,
                        int32_t op_ret,
                        int32_t op_errno,
                        dict_t *dict)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FGETXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fgetxattr_cbk.fn = fn;
	stub->args.fgetxattr_cbk.op_ret = op_ret;
	stub->args.fgetxattr_cbk.op_errno = op_errno;

	/* TODO */
	if (dict)
		stub->args.fgetxattr_cbk.dict = dict_ref (dict);
out:
	return stub;
}


call_stub_t *
fop_removexattr_stub (call_frame_t *frame,
		      fop_removexattr_t fn,
		      loc_t *loc,
		      const char *name)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
	GF_VALIDATE_OR_GOTO ("call-stub", name, out);

	stub = stub_new (frame, 1, GF_FOP_REMOVEXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.removexattr.fn = fn;
	loc_copy (&stub->args.removexattr.loc, loc);
	stub->args.removexattr.name = strdup (name);
out:
	return stub;
}


call_stub_t *
fop_removexattr_cbk_stub (call_frame_t *frame,
			  fop_removexattr_cbk_t fn,
			  int32_t op_ret,
			  int32_t op_errno)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_REMOVEXATTR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.removexattr_cbk.fn = fn;
	stub->args.removexattr_cbk.op_ret = op_ret;
	stub->args.removexattr_cbk.op_errno = op_errno;
out:
	return stub;
}


call_stub_t *
fop_lk_stub (call_frame_t *frame,
	     fop_lk_t fn,
	     fd_t *fd,
	     int32_t cmd,
	     struct flock *lock)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", lock, out);
	
	stub = stub_new (frame, 1, GF_FOP_LK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.lk.fn = fn;
	if (fd)
		stub->args.lk.fd = fd_ref (fd);
	stub->args.lk.cmd = cmd;
	stub->args.lk.lock = *lock;
out:
	return stub;
}


call_stub_t *
fop_lk_cbk_stub (call_frame_t *frame,
		 fop_lk_cbk_t fn,
		 int32_t op_ret,
		 int32_t op_errno,
		 struct flock *lock)

{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_LK);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.lk_cbk.fn = fn;
	stub->args.lk_cbk.op_ret = op_ret;
	stub->args.lk_cbk.op_errno = op_errno;
	if (op_ret == 0)
		stub->args.lk_cbk.lock = *lock;
out:
	return stub;
}
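
/* The locking fops (inodelk/finodelk, entrylk/fentrylk) also carry the
 * lock-domain volume name alongside the usual loc/fd arguments. */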

call_stub_t *
fop_inodelk_stub (call_frame_t *frame, fop_inodelk_t fn,
		  const char *volume, loc_t *loc, int32_t cmd, struct flock *lock)
{
  call_stub_t *stub = NULL;

  if (!frame || !lock)
    return NULL;

  stub = stub_new (frame, 1, GF_FOP_INODELK);
  if (!stub)
    return NULL;

  stub->args.inodelk.fn = fn;

  if (volume)
          stub->args.inodelk.volume = strdup (volume);

  loc_copy (&stub->args.inodelk.loc, loc);
  stub->args.inodelk.cmd  = cmd;
  stub->args.inodelk.lock = *lock;

  return stub;
}

call_stub_t *
fop_inodelk_cbk_stub (call_frame_t *frame, fop_inodelk_cbk_t fn,
		      int32_t op_ret, int32_t op_errno)
{
  call_stub_t *stub = NULL;

  if (!frame)
    return NULL;

  stub = stub_new (frame, 0, GF_FOP_INODELK);
  if (!stub)
    return NULL;

  stub->args.inodelk_cbk.fn       = fn;
  stub->args.inodelk_cbk.op_ret   = op_ret;
  stub->args.inodelk_cbk.op_errno = op_errno;

  return stub;
}


call_stub_t *
fop_finodelk_stub (call_frame_t *frame, fop_finodelk_t fn,
		   const char *volume, fd_t *fd, int32_t cmd, struct flock *lock)
{
	call_stub_t *stub = NULL;

	if (!frame || !lock)
		return NULL;

	stub = stub_new (frame, 1, GF_FOP_FINODELK);
	if (!stub)
		return NULL;

	stub->args.finodelk.fn = fn;

	if (fd)
		stub->args.finodelk.fd = fd_ref (fd);

	if (volume)
		stub->args.finodelk.volume = strdup (volume);

	stub->args.finodelk.cmd  = cmd;
	stub->args.finodelk.lock = *lock;

	return stub;
}


call_stub_t *
fop_finodelk_cbk_stub (call_frame_t *frame, fop_inodelk_cbk_t fn,
		       int32_t op_ret, int32_t op_errno)
{
	call_stub_t *stub = NULL;

	if (!frame)
		return NULL;

	stub = stub_new (frame, 0, GF_FOP_FINODELK);
	if (!stub)
		return NULL;

	stub->args.finodelk_cbk.fn       = fn;
	stub->args.finodelk_cbk.op_ret   = op_ret;
	stub->args.finodelk_cbk.op_errno = op_errno;

	return stub;
}


call_stub_t *
fop_entrylk_stub (call_frame_t *frame, fop_entrylk_t fn,
		  const char *volume, loc_t *loc, const char *name,
		  entrylk_cmd cmd, entrylk_type type)
{
	call_stub_t *stub = NULL;

	if (!frame)
		return NULL;

	stub = stub_new (frame, 1, GF_FOP_ENTRYLK);
	if (!stub)
		return NULL;

	stub->args.entrylk.fn = fn;

	if (volume)
		stub->args.entrylk.volume = strdup (volume);

	loc_copy (&stub->args.entrylk.loc, loc);

	stub->args.entrylk.cmd = cmd;
	stub->args.entrylk.type = type;
	if (name)
		stub->args.entrylk.name = strdup (name);

	return stub;
}

call_stub_t *
fop_entrylk_cbk_stub (call_frame_t *frame, fop_entrylk_cbk_t fn,
		      int32_t op_ret, int32_t op_errno)
{
	call_stub_t *stub = NULL;

	if (!frame)
		return NULL;

	stub = stub_new (frame, 0, GF_FOP_ENTRYLK);
	if (!stub)
		return NULL;

	stub->args.entrylk_cbk.fn       = fn;
	stub->args.entrylk_cbk.op_ret   = op_ret;
	stub->args.entrylk_cbk.op_errno = op_errno;

	return stub;
}


call_stub_t *
fop_fentrylk_stub (call_frame_t *frame, fop_fentrylk_t fn,
		   const char *volume, fd_t *fd, const char *name,
		   entrylk_cmd cmd, entrylk_type type)
{
	call_stub_t *stub = NULL;

	if (!frame)
		return NULL;

	stub = stub_new (frame, 1, GF_FOP_FENTRYLK);
	if (!stub)
		return NULL;

	stub->args.fentrylk.fn = fn;

	if (volume)
		stub->args.fentrylk.volume = strdup (volume);

	if (fd)
		stub->args.fentrylk.fd = fd_ref (fd);
	stub->args.fentrylk.cmd = cmd;
	stub->args.fentrylk.type = type;
	if (name)
		stub->args.fentrylk.name = strdup (name);

	return stub;
}

call_stub_t *
fop_fentrylk_cbk_stub (call_frame_t *frame, fop_fentrylk_cbk_t fn,
		       int32_t op_ret, int32_t op_errno)
{
	call_stub_t *stub = NULL;

	if (!frame)
		return NULL;

	stub = stub_new (frame, 0, GF_FOP_FENTRYLK);
	if (!stub)
		return NULL;

	stub->args.fentrylk_cbk.fn       = fn;
	stub->args.fentrylk_cbk.op_ret   = op_ret;
	stub->args.fentrylk_cbk.op_errno = op_errno;

	return stub;
}


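/* fop_setdents_stub takes ownership of the caller's entry list: the chain
 * hanging off entries->next is moved onto the stub and the caller's list is
 * truncated; the copied chain is freed in call_stub_destroy_wind (). */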
call_stub_t *
fop_setdents_stub (call_frame_t *frame,
		   fop_setdents_t fn,
		   fd_t *fd,
		   int32_t flags,
		   dir_entry_t *entries,
		   int32_t count)
{ 
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 1, GF_FOP_SETDENTS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	if (fd)
		stub->args.setdents.fd = fd_ref (fd);
	stub->args.setdents.fn = fn;
	stub->args.setdents.flags = flags;
	stub->args.setdents.count = count;
	if (entries) {
		stub->args.setdents.entries.next = entries->next;
		entries->next = NULL;
	}
out:
	return stub;
}

call_stub_t *
fop_setdents_cbk_stub (call_frame_t *frame,
		       fop_setdents_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno)
{  
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_SETDENTS);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.setdents_cbk.fn = fn;
	stub->args.setdents_cbk.op_ret = op_ret;
	stub->args.setdents_cbk.op_errno = op_errno;
out:
	return stub;
}

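/* The readdir(p) callback stubs below deep-copy every gf_dirent_t into a
 * stub-owned list, so the caller may free its own entries immediately; the
 * copy is released with gf_dirent_free () after the unwind. */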
call_stub_t *
fop_readdirp_cbk_stub (call_frame_t *frame,
		       fop_readdirp_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno,
		       gf_dirent_t *entries)
{
	call_stub_t *stub = NULL;
	gf_dirent_t *stub_entry = NULL, *entry = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_READDIRP);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.readdirp_cbk.fn = fn;
	stub->args.readdirp_cbk.op_ret = op_ret;
	stub->args.readdirp_cbk.op_errno = op_errno;
	INIT_LIST_HEAD (&stub->args.readdirp_cbk.entries.list);

	/* This check must come after the list head is initialised above,
	 * so that the (empty) list is valid for list_empty(). */
	if (!entries)
		goto out;

	if (op_ret > 0) {
		list_for_each_entry (entry, &entries->list, list) {
			stub_entry = gf_dirent_for_name (entry->d_name);
			ERR_ABORT (stub_entry);
			stub_entry->d_off = entry->d_off;
			stub_entry->d_ino = entry->d_ino;
			stub_entry->d_stat = entry->d_stat;
			list_add_tail (&stub_entry->list,
				       &stub->args.readdirp_cbk.entries.list);
		}
	}
out:
	return stub;
}


call_stub_t *
fop_readdir_cbk_stub (call_frame_t *frame,
		      fop_readdir_cbk_t fn,
		      int32_t op_ret,
		      int32_t op_errno,
		      gf_dirent_t *entries)
{
	call_stub_t *stub = NULL;
	gf_dirent_t *stub_entry = NULL, *entry = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_READDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
	
	stub->args.readdir_cbk.fn = fn;
	stub->args.readdir_cbk.op_ret = op_ret;
	stub->args.readdir_cbk.op_errno = op_errno;
	INIT_LIST_HEAD (&stub->args.readdir_cbk.entries.list);

	/* This check must come after the list head is initialised above,
	 * so that the (empty) list is valid for list_empty(). */
	if (!entries)
		goto out;

	if (op_ret > 0) {
		list_for_each_entry (entry, &entries->list, list) {
			stub_entry = gf_dirent_for_name (entry->d_name);
			ERR_ABORT (stub_entry);
			stub_entry->d_off = entry->d_off;
			stub_entry->d_ino = entry->d_ino;

			list_add_tail (&stub_entry->list, 
				       &stub->args.readdir_cbk.entries.list);
		}
	}
out:
	return stub;
}

call_stub_t *
fop_readdir_stub (call_frame_t *frame,
		  fop_readdir_t fn,
		  fd_t *fd,
		  size_t size,
		  off_t off)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", fd, out);

	stub = stub_new (frame, 1, GF_FOP_READDIR);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.readdir.fn = fn;
	stub->args.readdir.fd = fd_ref (fd);
	stub->args.readdir.size = size;
	stub->args.readdir.off = off;
out:
	return stub;
}

call_stub_t *
fop_readdirp_stub (call_frame_t *frame,
		   fop_readdirp_t fn,
		   fd_t *fd,
		   size_t size,
		   off_t off)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", fd, out);

	stub = stub_new (frame, 1, GF_FOP_READDIRP);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.readdirp.fn = fn;
	stub->args.readdirp.fd = fd_ref (fd);
	stub->args.readdirp.size = size;
	stub->args.readdirp.off = off;
out:
	return stub;
}

call_stub_t *
fop_checksum_stub (call_frame_t *frame,
		   fop_checksum_t fn,
		   loc_t *loc,
		   int32_t flags)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", loc, out);

	stub = stub_new (frame, 1, GF_FOP_CHECKSUM);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.checksum.fn = fn;
	loc_copy (&stub->args.checksum.loc, loc);
	stub->args.checksum.flags = flags;
out:
	return stub;
}


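/* On success the checksum callback stub duplicates both NAME_MAX-byte
 * checksum buffers; they are FREE()d again in call_resume_unwind () once
 * the reply has been handed to the callback. */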
call_stub_t *
fop_checksum_cbk_stub (call_frame_t *frame,
		       fop_checksum_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno,
		       uint8_t *file_checksum,
		       uint8_t *dir_checksum)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_CHECKSUM);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.checksum_cbk.fn = fn;
	stub->args.checksum_cbk.op_ret = op_ret;
	stub->args.checksum_cbk.op_errno = op_errno;
	if (op_ret >= 0)
	{
		stub->args.checksum_cbk.file_checksum = 
			memdup (file_checksum, NAME_MAX);

		stub->args.checksum_cbk.dir_checksum = 
			memdup (dir_checksum, NAME_MAX);
	}
out:
	return stub;
}


call_stub_t *
fop_rchecksum_stub (call_frame_t *frame,
                    fop_rchecksum_t fn,
                    fd_t *fd, off_t offset,
                    int32_t len)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	GF_VALIDATE_OR_GOTO ("call-stub", fd, out);

	stub = stub_new (frame, 1, GF_FOP_RCHECKSUM);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.rchecksum.fn = fn;
        stub->args.rchecksum.fd = fd_ref (fd);
	stub->args.rchecksum.offset = offset;
	stub->args.rchecksum.len    = len;
out:
	return stub;
}


call_stub_t *
fop_rchecksum_cbk_stub (call_frame_t *frame,
                        fop_rchecksum_cbk_t fn,
                        int32_t op_ret,
                        int32_t op_errno,
                        uint32_t weak_checksum,
                        uint8_t *strong_checksum)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_RCHECKSUM);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.rchecksum_cbk.fn = fn;
	stub->args.rchecksum_cbk.op_ret = op_ret;
	stub->args.rchecksum_cbk.op_errno = op_errno;

	if (op_ret >= 0)
	{
		stub->args.rchecksum_cbk.weak_checksum =
                        weak_checksum;

		stub->args.rchecksum_cbk.strong_checksum = 
			memdup (strong_checksum, MD5_DIGEST_LEN);
	}
out:
	return stub;
}


call_stub_t *
fop_xattrop_cbk_stub (call_frame_t *frame,
		      fop_xattrop_cbk_t fn,
		      int32_t op_ret,
		      int32_t op_errno)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	
	stub = stub_new (frame, 0, GF_FOP_XATTROP);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.xattrop_cbk.fn       = fn;
	stub->args.xattrop_cbk.op_ret   = op_ret;
	stub->args.xattrop_cbk.op_errno = op_errno;

out:
	return stub;
}


call_stub_t *
fop_fxattrop_cbk_stub (call_frame_t *frame,
		       fop_fxattrop_cbk_t fn,
		       int32_t op_ret,
		       int32_t op_errno,
		       dict_t *xattr)
{
	call_stub_t *stub = NULL;
	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);

	stub = stub_new (frame, 0, GF_FOP_FXATTROP);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.fxattrop_cbk.fn = fn;
	stub->args.fxattrop_cbk.op_ret = op_ret;
	stub->args.fxattrop_cbk.op_errno = op_errno;
	if (xattr) 
		stub->args.fxattrop_cbk.xattr = dict_ref (xattr);

out:
	return stub;
}


call_stub_t *
fop_xattrop_stub (call_frame_t *frame,
		  fop_xattrop_t fn,
		  loc_t *loc,
		  gf_xattrop_flags_t optype,
		  dict_t *xattr)
{
	call_stub_t *stub = NULL;

	if (!frame || !xattr)
		return NULL;

	stub = stub_new (frame, 1, GF_FOP_XATTROP);
	if (!stub)
		return NULL;

	stub->args.xattrop.fn = fn;
	
	loc_copy (&stub->args.xattrop.loc, loc);

	stub->args.xattrop.optype = optype;
	stub->args.xattrop.xattr = dict_ref (xattr);

	return stub;
}

call_stub_t *
fop_fxattrop_stub (call_frame_t *frame,
		   fop_fxattrop_t fn,
		   fd_t *fd,
		   gf_xattrop_flags_t optype,
		   dict_t *xattr)
{
	call_stub_t *stub = NULL;

	if (!frame || !xattr)
		return NULL;

	stub = stub_new (frame, 1, GF_FOP_FXATTROP);
	if (!stub)
		return NULL;

	stub->args.fxattrop.fn = fn;
	
	if (fd)
		stub->args.fxattrop.fd = fd_ref (fd);

	stub->args.fxattrop.optype = optype;
	stub->args.fxattrop.xattr = dict_ref (xattr);

	return stub;
}


call_stub_t *
fop_lock_notify_cbk_stub (call_frame_t *frame, fop_lock_notify_cbk_t fn,
                          int32_t op_ret, int32_t op_errno)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	
	stub = stub_new (frame, 0, GF_FOP_LOCK_NOTIFY);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.lock_notify_cbk.fn       = fn;
	stub->args.lock_notify_cbk.op_ret   = op_ret;
	stub->args.lock_notify_cbk.op_errno = op_errno;

out:
	return stub;
}


call_stub_t *
fop_lock_notify_stub (call_frame_t *frame, fop_lock_notify_t fn,
		      loc_t *loc, int32_t timeout)
{
	call_stub_t *stub = NULL;

	if (!frame)
		return NULL;

	stub = stub_new (frame, 1, GF_FOP_LOCK_NOTIFY);
	if (!stub)
		return NULL;

	stub->args.lock_notify.fn = fn;
	
	loc_copy (&stub->args.lock_notify.loc, loc);

	stub->args.lock_notify.timeout = timeout;

	return stub;
}


call_stub_t *
fop_lock_fnotify_cbk_stub (call_frame_t *frame, fop_lock_fnotify_cbk_t fn,
                           int32_t op_ret, int32_t op_errno)
{
	call_stub_t *stub = NULL;

	GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
	
	stub = stub_new (frame, 0, GF_FOP_LOCK_FNOTIFY);
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	stub->args.lock_fnotify_cbk.fn       = fn;
	stub->args.lock_fnotify_cbk.op_ret   = op_ret;
	stub->args.lock_fnotify_cbk.op_errno = op_errno;

out:
	return stub;
}


call_stub_t *
fop_lock_fnotify_stub (call_frame_t *frame, fop_lock_fnotify_t fn,
		       fd_t *fd, int32_t timeout)
{
	call_stub_t *stub = NULL;

	if (!frame)
		return NULL;

	stub = stub_new (frame, 1, GF_FOP_LOCK_FNOTIFY);
	if (!stub)
		return NULL;

	stub->args.lock_fnotify.fn = fn;

	if (fd)
		stub->args.lock_fnotify.fd = fd_ref (fd);
	stub->args.lock_fnotify.timeout = timeout;

	return stub;
}

call_stub_t *
fop_setattr_cbk_stub (call_frame_t *frame,
                      fop_setattr_cbk_t fn,
                      int32_t op_ret,
                      int32_t op_errno,
                      struct stat *statpre,
                      struct stat *statpost)
{
        call_stub_t *stub = NULL;

        if (frame == NULL)
                goto out;

	stub = stub_new (frame, 0, GF_FOP_SETATTR);
	if (stub == NULL)
                goto out;

	stub->args.setattr_cbk.fn = fn;

        stub->args.setattr_cbk.op_ret = op_ret;
        stub->args.setattr_cbk.op_errno = op_errno;

        if (statpre)
                stub->args.setattr_cbk.statpre = *statpre;
        if (statpost)
                stub->args.setattr_cbk.statpost = *statpost;

out:
	return stub;
}

call_stub_t *
fop_fsetattr_cbk_stub (call_frame_t *frame,
                       fop_setattr_cbk_t fn,
                       int32_t op_ret,
                       int32_t op_errno,
                       struct stat *statpre,
                       struct stat *statpost)
{
        call_stub_t *stub = NULL;

        if (frame == NULL)
                goto out;

	stub = stub_new (frame, 0, GF_FOP_FSETATTR);
	if (stub == NULL)
                goto out;

	stub->args.fsetattr_cbk.fn = fn;

        stub->args.fsetattr_cbk.op_ret = op_ret;
        stub->args.fsetattr_cbk.op_errno = op_errno;

        if (statpre)
                stub->args.fsetattr_cbk.statpre = *statpre;
        if (statpost)
                stub->args.fsetattr_cbk.statpost = *statpost;
out:
	return stub;
}

call_stub_t *
fop_setattr_stub (call_frame_t *frame,
                  fop_setattr_t fn,
                  loc_t *loc,
                  struct stat *stbuf,
                  int32_t valid)
{
        call_stub_t *stub = NULL;

        if (frame == NULL)
                goto out;

        if (fn == NULL)
                goto out;

	stub = stub_new (frame, 1, GF_FOP_SETATTR);
	if (stub == NULL)
                goto out;

	stub->args.setattr.fn = fn;

	loc_copy (&stub->args.setattr.loc, loc);

        if (stbuf)
                stub->args.setattr.stbuf = *stbuf;

        stub->args.setattr.valid = valid;

out:
	return stub;
}

call_stub_t *
fop_fsetattr_stub (call_frame_t *frame,
                   fop_fsetattr_t fn,
                   fd_t *fd,
                   struct stat *stbuf,
                   int32_t valid)
{
        call_stub_t *stub = NULL;

        if (frame == NULL)
                goto out;

        if (fn == NULL)
                goto out;

	stub = stub_new (frame, 1, GF_FOP_FSETATTR);
	if (stub == NULL)
                goto out;

	stub->args.fsetattr.fn = fn;

        if (fd)
                stub->args.fsetattr.fd = fd_ref (fd);

        if (stbuf)
                stub->args.fsetattr.stbuf = *stbuf;

        stub->args.fsetattr.valid = valid;

out:
	return stub;
}

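/* Resume a paused call on its wind (request) path: re-issue the stored fop
 * on stub->frame->this with the arguments captured when the stub was
 * created. */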
static void
call_resume_wind (call_stub_t *stub)
{
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	switch (stub->fop) {
	case GF_FOP_OPEN:
	{
		stub->args.open.fn (stub->frame, 
				    stub->frame->this,
				    &stub->args.open.loc, 
				    stub->args.open.flags, stub->args.open.fd,
                                    stub->args.open.wbflags);
		break;
	}
	case GF_FOP_CREATE:
	{
		stub->args.create.fn (stub->frame,
				      stub->frame->this,
				      &stub->args.create.loc,
				      stub->args.create.flags,
				      stub->args.create.mode,
				      stub->args.create.fd);
		break;
	}
	case GF_FOP_STAT:
	{
		stub->args.stat.fn (stub->frame,
				    stub->frame->this,
				    &stub->args.stat.loc);
		break;
	}
	case GF_FOP_READLINK:
	{
		stub->args.readlink.fn (stub->frame,
					stub->frame->this,
					&stub->args.readlink.loc,
					stub->args.readlink.size);
		break;
	}
  
	case GF_FOP_MKNOD:
	{
		stub->args.mknod.fn (stub->frame,
				     stub->frame->this,
				     &stub->args.mknod.loc,
				     stub->args.mknod.mode,
				     stub->args.mknod.rdev);
	}
	break;
  
	case GF_FOP_MKDIR:
	{
		stub->args.mkdir.fn (stub->frame,
				     stub->frame->this,
				     &stub->args.mkdir.loc,
				     stub->args.mkdir.mode);
	}
	break;
  
	case GF_FOP_UNLINK:
	{
		stub->args.unlink.fn (stub->frame,
				      stub->frame->this,
				      &stub->args.unlink.loc);
	}
	break;

	case GF_FOP_RMDIR:
	{
		stub->args.rmdir.fn (stub->frame,
				     stub->frame->this,
				     &stub->args.rmdir.loc);
	}
	break;
      
	case GF_FOP_SYMLINK:
	{
		stub->args.symlink.fn (stub->frame,
				       stub->frame->this,
				       stub->args.symlink.linkname,
				       &stub->args.symlink.loc);
	}
	break;
  
	case GF_FOP_RENAME:
	{
		stub->args.rename.fn (stub->frame,
				      stub->frame->this,
				      &stub->args.rename.old,
				      &stub->args.rename.new);
	}
	break;

	case GF_FOP_LINK:
	{
		stub->args.link.fn (stub->frame,
				    stub->frame->this,
				    &stub->args.link.oldloc,
				    &stub->args.link.newloc);
	}
	break;
  
	case GF_FOP_TRUNCATE:
	{
		stub->args.truncate.fn (stub->frame,
					stub->frame->this,
					&stub->args.truncate.loc,
					stub->args.truncate.off);
		break;
	}
      
	case GF_FOP_READ:
	{
		stub->args.readv.fn (stub->frame,
				     stub->frame->this,
				     stub->args.readv.fd,
				     stub->args.readv.size,
				     stub->args.readv.off);
		break;
	}
  
	case GF_FOP_WRITE:
	{
		stub->args.writev.fn (stub->frame,
				      stub->frame->this,
				      stub->args.writev.fd,
				      stub->args.writev.vector,
				      stub->args.writev.count,
				      stub->args.writev.off,
                                      stub->args.writev.iobref);
		break;
	}
  
	case GF_FOP_STATFS:
	{
		stub->args.statfs.fn (stub->frame,
				      stub->frame->this,
				      &stub->args.statfs.loc);
		break;
	}
	case GF_FOP_FLUSH:
	{
		stub->args.flush.fn (stub->frame,
				     stub->frame->this,
				     stub->args.flush.fd);
		break;
	}
  
	case GF_FOP_FSYNC:
	{
		stub->args.fsync.fn (stub->frame,
				     stub->frame->this,
				     stub->args.fsync.fd,
				     stub->args.fsync.datasync);
		break;
	}

	case GF_FOP_SETXATTR:
	{
		stub->args.setxattr.fn (stub->frame,
					stub->frame->this,
					&stub->args.setxattr.loc,
					stub->args.setxattr.dict,
					stub->args.setxattr.flags);
		break;
	}
  
	case GF_FOP_GETXATTR:
	{
		stub->args.getxattr.fn (stub->frame,
					stub->frame->this,
					&stub->args.getxattr.loc,
					stub->args.getxattr.name);
		break;
	}

	case GF_FOP_FSETXATTR:
	{
		stub->args.fsetxattr.fn (stub->frame,
                                         stub->frame->this,
                                         stub->args.fsetxattr.fd,
                                         stub->args.fsetxattr.dict,
                                         stub->args.fsetxattr.flags);
		break;
	}

	case GF_FOP_FGETXATTR:
	{
		stub->args.fgetxattr.fn (stub->frame,
                                         stub->frame->this,
                                         stub->args.fgetxattr.fd,
                                         stub->args.fgetxattr.name);
		break;
	}

	case GF_FOP_REMOVEXATTR:
	{
		stub->args.removexattr.fn (stub->frame,
					   stub->frame->this,
					   &stub->args.removexattr.loc,
					   stub->args.removexattr.name);
		break;
	}
  
	case GF_FOP_OPENDIR:
	{
		stub->args.opendir.fn (stub->frame,
				       stub->frame->this,
				       &stub->args.opendir.loc,
				       stub->args.opendir.fd);
		break;
	}

	case GF_FOP_GETDENTS:
	{
		stub->args.getdents.fn (stub->frame,
					stub->frame->this,
					stub->args.getdents.fd,
					stub->args.getdents.size,
					stub->args.getdents.off,
					stub->args.getdents.flag);
		break;
	}

	case GF_FOP_FSYNCDIR:
	{
		stub->args.fsyncdir.fn (stub->frame,
					stub->frame->this,
					stub->args.fsyncdir.fd,
					stub->args.fsyncdir.datasync);
		break;
	}
  
	case GF_FOP_ACCESS:
	{
		stub->args.access.fn (stub->frame,
				      stub->frame->this,
				      &stub->args.access.loc,
				      stub->args.access.mask);
		break;
	}
  
	case GF_FOP_FTRUNCATE:
	{
		stub->args.ftruncate.fn (stub->frame,
					 stub->frame->this,
					 stub->args.ftruncate.fd,
					 stub->args.ftruncate.off);
		break;
	}
  
	case GF_FOP_FSTAT:
	{
		stub->args.fstat.fn (stub->frame,
				     stub->frame->this,
				     stub->args.fstat.fd);
		break;
	}
  
	case GF_FOP_LK:
	{
		stub->args.lk.fn (stub->frame,
				  stub->frame->this,
				  stub->args.lk.fd,
				  stub->args.lk.cmd,
				  &stub->args.lk.lock);
		break;
	}

	case GF_FOP_INODELK:
	{
		stub->args.inodelk.fn (stub->frame,
				       stub->frame->this,
                                       stub->args.inodelk.volume,
				       &stub->args.inodelk.loc,
				       stub->args.inodelk.cmd,
				       &stub->args.inodelk.lock);
		break;
	}

	case GF_FOP_FINODELK:
	{
		stub->args.finodelk.fn (stub->frame,
					stub->frame->this,
                                        stub->args.finodelk.volume,
					stub->args.finodelk.fd,
					stub->args.finodelk.cmd,
					&stub->args.finodelk.lock);
		break;
	}

	case GF_FOP_ENTRYLK:
	{
		stub->args.entrylk.fn (stub->frame,
				       stub->frame->this,
                                       stub->args.entrylk.volume,
				       &stub->args.entrylk.loc,
				       stub->args.entrylk.name,
				       stub->args.entrylk.cmd,
				       stub->args.entrylk.type);
		break;
	}

	case GF_FOP_FENTRYLK:
	{
		stub->args.fentrylk.fn (stub->frame,
					stub->frame->this,
                                        stub->args.fentrylk.volume,
					stub->args.fentrylk.fd,
					stub->args.fentrylk.name,
					stub->args.fentrylk.cmd,
					stub->args.fentrylk.type);
		break;
	}
  
  
	case GF_FOP_LOOKUP:
	{
		stub->args.lookup.fn (stub->frame, 
				      stub->frame->this,
				      &stub->args.lookup.loc,
				      stub->args.lookup.xattr_req);
		break;
	}

	case GF_FOP_SETDENTS:
	{
		stub->args.setdents.fn (stub->frame,
					stub->frame->this,
					stub->args.setdents.fd,
					stub->args.setdents.flags,
					&stub->args.setdents.entries,
					stub->args.setdents.count);
		break;
	}

	case GF_FOP_CHECKSUM:
	{
		stub->args.checksum.fn (stub->frame,
					stub->frame->this,
					&stub->args.checksum.loc,
					stub->args.checksum.flags);
		break;
	}

	case GF_FOP_RCHECKSUM:
	{
		stub->args.rchecksum.fn (stub->frame,
                                         stub->frame->this,
                                         stub->args.rchecksum.fd,
                                         stub->args.rchecksum.offset,
                                         stub->args.rchecksum.len);
		break;
	}

	case GF_FOP_READDIR:
	{
		stub->args.readdir.fn (stub->frame,
				       stub->frame->this,
				       stub->args.readdir.fd,
				       stub->args.readdir.size,
				       stub->args.readdir.off);
		break;
	}

        case GF_FOP_READDIRP:
	{
		stub->args.readdirp.fn (stub->frame,
				        stub->frame->this,
				        stub->args.readdirp.fd,
				        stub->args.readdirp.size,
				        stub->args.readdirp.off);
		break;
	}

	case GF_FOP_XATTROP:
	{
		stub->args.xattrop.fn (stub->frame,
				       stub->frame->this,
				       &stub->args.xattrop.loc,
				       stub->args.xattrop.optype,
				       stub->args.xattrop.xattr);

		break;
	}
	case GF_FOP_FXATTROP:
	{
		stub->args.fxattrop.fn (stub->frame,
					stub->frame->this,
					stub->args.fxattrop.fd,
					stub->args.fxattrop.optype,
					stub->args.fxattrop.xattr);

		break;
	}
	case GF_FOP_LOCK_NOTIFY:
	{
		stub->args.lock_notify.fn (stub->frame,
					   stub->frame->this,
					   &stub->args.lock_notify.loc,
					   stub->args.lock_notify.timeout);
		break;
	}
	case GF_FOP_LOCK_FNOTIFY:
	{
		stub->args.lock_fnotify.fn (stub->frame,
					    stub->frame->this,
					    stub->args.lock_fnotify.fd,
					    stub->args.lock_fnotify.timeout);
		break;
	}
        case GF_FOP_SETATTR:
        {
                stub->args.setattr.fn (stub->frame,
                                       stub->frame->this,
                                       &stub->args.setattr.loc,
                                       &stub->args.setattr.stbuf,
                                       stub->args.setattr.valid);
                break;
        }
        case GF_FOP_FSETATTR:
        {
                stub->args.fsetattr.fn (stub->frame,
                                        stub->frame->this,
                                        stub->args.fsetattr.fd,
                                        &stub->args.fsetattr.stbuf,
                                        stub->args.fsetattr.valid);
                break;
        }
	default:
	{
		gf_log ("call-stub",
			GF_LOG_DEBUG,
			"Invalid value of FOP");
	}
	break;
	}
out:
	return;
}



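/* Resume a paused call on its unwind (reply) path: hand the stored return
 * values to the saved callback, or STACK_UNWIND the frame directly when no
 * callback was registered.  Several cases also release dicts, inodes,
 * dirent lists and checksum buffers once the reply has been delivered. */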
static void
call_resume_unwind (call_stub_t *stub)
{
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	switch (stub->fop) {
	case GF_FOP_OPEN:
	{
		if (!stub->args.open_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.open_cbk.op_ret,
				      stub->args.open_cbk.op_errno,
				      stub->args.open_cbk.fd);
		else
			stub->args.open_cbk.fn (stub->frame, 
						stub->frame->cookie,
						stub->frame->this,
						stub->args.open_cbk.op_ret, 
						stub->args.open_cbk.op_errno,
						stub->args.open_cbk.fd);
		break;
	}

	case GF_FOP_CREATE:
	{
		if (!stub->args.create_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.create_cbk.op_ret,
				      stub->args.create_cbk.op_errno,
				      stub->args.create_cbk.fd,
				      stub->args.create_cbk.inode,
				      &stub->args.create_cbk.buf,
                                      &stub->args.create_cbk.preparent,
                                      &stub->args.create_cbk.postparent);
		else
			stub->args.create_cbk.fn (stub->frame,
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.create_cbk.op_ret,
						  stub->args.create_cbk.op_errno,
						  stub->args.create_cbk.fd,
						  stub->args.create_cbk.inode,
						  &stub->args.create_cbk.buf,
                                                  &stub->args.create_cbk.preparent,
                                                  &stub->args.create_cbk.postparent);
      
		break;
	}

	case GF_FOP_STAT:
	{
		if (!stub->args.stat_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.stat_cbk.op_ret,
				      stub->args.stat_cbk.op_errno,
				      &stub->args.stat_cbk.buf);
		else
			stub->args.stat_cbk.fn (stub->frame,
						stub->frame->cookie,
						stub->frame->this,
						stub->args.stat_cbk.op_ret,
						stub->args.stat_cbk.op_errno,
						&stub->args.stat_cbk.buf);

		break;
	}

	case GF_FOP_READLINK:
	{
		if (!stub->args.readlink_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.readlink_cbk.op_ret,
				      stub->args.readlink_cbk.op_errno,
				      stub->args.readlink_cbk.buf,
                                      &stub->args.readlink_cbk.sbuf);
		else
			stub->args.readlink_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.readlink_cbk.op_ret,
						    stub->args.readlink_cbk.op_errno,
						    stub->args.readlink_cbk.buf,
                                                    &stub->args.readlink_cbk.sbuf);

		break;
	}
  
	case GF_FOP_MKNOD:
	{
		if (!stub->args.mknod_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.mknod_cbk.op_ret,
				      stub->args.mknod_cbk.op_errno,
				      stub->args.mknod_cbk.inode,
                                      &stub->args.mknod_cbk.buf,
                                      &stub->args.mknod_cbk.preparent,
                                      &stub->args.mknod_cbk.postparent);
		else
			stub->args.mknod_cbk.fn (stub->frame,
						 stub->frame->cookie,
						 stub->frame->this,
						 stub->args.mknod_cbk.op_ret,
						 stub->args.mknod_cbk.op_errno,
						 stub->args.mknod_cbk.inode,
                                                 &stub->args.mknod_cbk.buf,
                                                 &stub->args.mknod_cbk.preparent,
                                                 &stub->args.mknod_cbk.postparent);
		break;
	}

	case GF_FOP_MKDIR:
	{
		if (!stub->args.mkdir_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.mkdir_cbk.op_ret,
				      stub->args.mkdir_cbk.op_errno,
				      stub->args.mkdir_cbk.inode,
                                      &stub->args.mkdir_cbk.buf,
                                      &stub->args.mkdir_cbk.preparent,
                                      &stub->args.mkdir_cbk.postparent);
		else
			stub->args.mkdir_cbk.fn (stub->frame,
						 stub->frame->cookie,
						 stub->frame->this,
						 stub->args.mkdir_cbk.op_ret,
						 stub->args.mkdir_cbk.op_errno,
						 stub->args.mkdir_cbk.inode,
                                                 &stub->args.mkdir_cbk.buf,
                                                 &stub->args.mkdir_cbk.preparent,
                                                 &stub->args.mkdir_cbk.postparent);

		if (stub->args.mkdir_cbk.inode)
			inode_unref (stub->args.mkdir_cbk.inode);

		break;
	}
  
	case GF_FOP_UNLINK:
	{
		if (!stub->args.unlink_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.unlink_cbk.op_ret,
				      stub->args.unlink_cbk.op_errno,
                                      &stub->args.unlink_cbk.preparent,
                                      &stub->args.unlink_cbk.postparent);
		else
			stub->args.unlink_cbk.fn (stub->frame,
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.unlink_cbk.op_ret,
						  stub->args.unlink_cbk.op_errno,
                                                  &stub->args.unlink_cbk.preparent,
                                                  &stub->args.unlink_cbk.postparent);
		break;
	}
  
	case GF_FOP_RMDIR:
	{
		if (!stub->args.rmdir_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.rmdir_cbk.op_ret,
				      stub->args.rmdir_cbk.op_errno,
                                      &stub->args.rmdir_cbk.preparent,
                                      &stub->args.rmdir_cbk.postparent);
		else
			stub->args.rmdir_cbk.fn (stub->frame,
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.rmdir_cbk.op_ret,
						  stub->args.rmdir_cbk.op_errno,
                                                  &stub->args.rmdir_cbk.preparent,
                                                  &stub->args.rmdir_cbk.postparent);
		break;
	}

	case GF_FOP_SYMLINK:
	{
		if (!stub->args.symlink_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.symlink_cbk.op_ret,
				      stub->args.symlink_cbk.op_errno,
				      stub->args.symlink_cbk.inode,
                                      &stub->args.symlink_cbk.buf,
                                      &stub->args.symlink_cbk.preparent,
                                      &stub->args.symlink_cbk.postparent);
		else
			stub->args.symlink_cbk.fn (stub->frame,
						   stub->frame->cookie,
						   stub->frame->this,
						   stub->args.symlink_cbk.op_ret,
						   stub->args.symlink_cbk.op_errno,
						   stub->args.symlink_cbk.inode,
                                                   &stub->args.symlink_cbk.buf,
                                                   &stub->args.symlink_cbk.preparent,
                                                   &stub->args.symlink_cbk.postparent);
	}
	break;
  
	case GF_FOP_RENAME:
	{
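		/* NOTE: the rename unwind below is compiled out (#if 0), so
		 * resuming a GF_FOP_RENAME unwind currently returns without
		 * invoking the stored callback or unwinding the frame. */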
#if 0
		if (!stub->args.rename_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.rename_cbk.op_ret,
				      stub->args.rename_cbk.op_errno,
				      &stub->args.rename_cbk.buf,
                                      &stub->args.rename_cbk.preoldparent,
                                      &stub->args.rename_cbk.postoldparent,
                                      &stub->args.rename_cbk.prenewparent,
                                      &stub->args.rename_cbk.postnewparent);
		else
			stub->args.rename_cbk.fn (stub->frame,
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.rename_cbk.op_ret,
						  stub->args.rename_cbk.op_errno,
						  &stub->args.rename_cbk.buf,
                                                  &stub->args.rename_cbk.preoldparent,
                                                  &stub->args.rename_cbk.postoldparent,
                                                  &stub->args.rename_cbk.prenewparent,
                                                  &stub->args.rename_cbk.postnewparent);
#endif
		break;
	}
  
	case GF_FOP_LINK:
	{
		if (!stub->args.link_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.link_cbk.op_ret,
				      stub->args.link_cbk.op_errno,
				      stub->args.link_cbk.inode,
				      &stub->args.link_cbk.buf,
				      &stub->args.link_cbk.preparent,
				      &stub->args.link_cbk.postparent);
		else
			stub->args.link_cbk.fn (stub->frame,
						stub->frame->cookie,
						stub->frame->this,
						stub->args.link_cbk.op_ret,
						stub->args.link_cbk.op_errno,
						stub->args.link_cbk.inode,
                                                &stub->args.link_cbk.buf,
                                                &stub->args.link_cbk.preparent,
                                                &stub->args.link_cbk.postparent);
		break;
	}
  
	case GF_FOP_TRUNCATE:
	{
		if (!stub->args.truncate_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.truncate_cbk.op_ret,
				      stub->args.truncate_cbk.op_errno,
				      &stub->args.truncate_cbk.prebuf,
                                      &stub->args.truncate_cbk.postbuf);
		else
			stub->args.truncate_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.truncate_cbk.op_ret,
						    stub->args.truncate_cbk.op_errno,
						    &stub->args.truncate_cbk.prebuf,
                                                    &stub->args.truncate_cbk.postbuf);
		break;
	}
      
	case GF_FOP_READ:
	{
		if (!stub->args.readv_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.readv_cbk.op_ret,
				      stub->args.readv_cbk.op_errno,
				      stub->args.readv_cbk.vector,
				      stub->args.readv_cbk.count,
				      &stub->args.readv_cbk.stbuf,
                                      stub->args.readv_cbk.iobref);
		else
			stub->args.readv_cbk.fn (stub->frame,
						 stub->frame->cookie,
						 stub->frame->this,
						 stub->args.readv_cbk.op_ret,
						 stub->args.readv_cbk.op_errno,
						 stub->args.readv_cbk.vector,
						 stub->args.readv_cbk.count,
						 &stub->args.readv_cbk.stbuf,
                                                 stub->args.readv_cbk.iobref);
	}
	break;
  
	case GF_FOP_WRITE:
	{
		if (!stub->args.writev_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.writev_cbk.op_ret,
				      stub->args.writev_cbk.op_errno,
                                      &stub->args.writev_cbk.prebuf,
				      &stub->args.writev_cbk.postbuf);
		else
			stub->args.writev_cbk.fn (stub->frame,
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.writev_cbk.op_ret,
						  stub->args.writev_cbk.op_errno,
                                                  &stub->args.writev_cbk.prebuf,
						  &stub->args.writev_cbk.postbuf);
		break;
	}
  
	case GF_FOP_STATFS:
	{
		if (!stub->args.statfs_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.statfs_cbk.op_ret,
				      stub->args.statfs_cbk.op_errno,
				      &(stub->args.statfs_cbk.buf));
		else
			stub->args.statfs_cbk.fn (stub->frame,
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.statfs_cbk.op_ret,
						  stub->args.statfs_cbk.op_errno,
						  &(stub->args.statfs_cbk.buf));
	}
	break;
  
	case GF_FOP_FLUSH:
	{
		if (!stub->args.flush_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.flush_cbk.op_ret,
				      stub->args.flush_cbk.op_errno);
		else
			stub->args.flush_cbk.fn (stub->frame,
						 stub->frame->cookie,
						 stub->frame->this,
						 stub->args.flush_cbk.op_ret,
						 stub->args.flush_cbk.op_errno);
      
		break;
	}
  
	case GF_FOP_FSYNC:
	{
		if (!stub->args.fsync_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.fsync_cbk.op_ret,
				      stub->args.fsync_cbk.op_errno,
                                      &stub->args.fsync_cbk.prebuf,
                                      &stub->args.fsync_cbk.postbuf);
		else
			stub->args.fsync_cbk.fn (stub->frame,
						 stub->frame->cookie,
						 stub->frame->this,
						 stub->args.fsync_cbk.op_ret,
						 stub->args.fsync_cbk.op_errno,
                                                 &stub->args.fsync_cbk.prebuf,
                                                 &stub->args.fsync_cbk.postbuf);
		break;
	}
  
	case GF_FOP_SETXATTR:
	{
		if (!stub->args.setxattr_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.setxattr_cbk.op_ret,
				      stub->args.setxattr_cbk.op_errno);

		else
			stub->args.setxattr_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.setxattr_cbk.op_ret,
						    stub->args.setxattr_cbk.op_errno);

		break;
	}
  
	case GF_FOP_GETXATTR:
	{
		if (!stub->args.getxattr_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.getxattr_cbk.op_ret,
				      stub->args.getxattr_cbk.op_errno,
				      stub->args.getxattr_cbk.dict);
		else
			stub->args.getxattr_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.getxattr_cbk.op_ret,
						    stub->args.getxattr_cbk.op_errno,
						    stub->args.getxattr_cbk.dict);
		break;
	}

	case GF_FOP_FSETXATTR:
	{
		if (!stub->args.fsetxattr_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.fsetxattr_cbk.op_ret,
				      stub->args.fsetxattr_cbk.op_errno);

		else
			stub->args.fsetxattr_cbk.fn (stub->frame,
                                                     stub->frame->cookie,
                                                     stub->frame->this,
                                                     stub->args.fsetxattr_cbk.op_ret,
                                                     stub->args.fsetxattr_cbk.op_errno);

		break;
	}

	case GF_FOP_FGETXATTR:
	{
		if (!stub->args.fgetxattr_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.fgetxattr_cbk.op_ret,
				      stub->args.fgetxattr_cbk.op_errno,
				      stub->args.fgetxattr_cbk.dict);
		else
			stub->args.fgetxattr_cbk.fn (stub->frame,
                                                     stub->frame->cookie,
                                                     stub->frame->this,
                                                     stub->args.fgetxattr_cbk.op_ret,
                                                     stub->args.fgetxattr_cbk.op_errno,
                                                     stub->args.fgetxattr_cbk.dict);
		break;
	}

	case GF_FOP_REMOVEXATTR:
	{
		if (!stub->args.removexattr_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.removexattr_cbk.op_ret,
				      stub->args.removexattr_cbk.op_errno);
		else
			stub->args.removexattr_cbk.fn (stub->frame,
						       stub->frame->cookie,
						       stub->frame->this,
						       stub->args.removexattr_cbk.op_ret,
						       stub->args.removexattr_cbk.op_errno);

		break;
	}
  
	case GF_FOP_OPENDIR:
	{
		if (!stub->args.opendir_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.opendir_cbk.op_ret,
				      stub->args.opendir_cbk.op_errno,
				      stub->args.opendir_cbk.fd);
		else
			stub->args.opendir_cbk.fn (stub->frame,
						   stub->frame->cookie,
						   stub->frame->this,
						   stub->args.opendir_cbk.op_ret,
						   stub->args.opendir_cbk.op_errno,
						   stub->args.opendir_cbk.fd);
		break;
	}
  
	case GF_FOP_GETDENTS:
	{
		if (!stub->args.getdents_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.getdents_cbk.op_ret,
				      stub->args.getdents_cbk.op_errno,
				      &stub->args.getdents_cbk.entries,
				      stub->args.getdents_cbk.count);
		else
			stub->args.getdents_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.getdents_cbk.op_ret,
						    stub->args.getdents_cbk.op_errno,
						    &stub->args.getdents_cbk.entries,
						    stub->args.getdents_cbk.count);
		break;
	}
  
	case GF_FOP_FSYNCDIR:
	{
		if (!stub->args.fsyncdir_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.fsyncdir_cbk.op_ret,
				      stub->args.fsyncdir_cbk.op_errno);
		else
			stub->args.fsyncdir_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.fsyncdir_cbk.op_ret,
						    stub->args.fsyncdir_cbk.op_errno);
		break;
	}
  
	case GF_FOP_ACCESS:
	{
		if (!stub->args.access_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.access_cbk.op_ret,
				      stub->args.access_cbk.op_errno);
		else
			stub->args.access_cbk.fn (stub->frame,
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.access_cbk.op_ret,
						  stub->args.access_cbk.op_errno);

		break;
	}
  
	case GF_FOP_FTRUNCATE:
	{
		if (!stub->args.ftruncate_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.ftruncate_cbk.op_ret,
				      stub->args.ftruncate_cbk.op_errno,
				      &stub->args.ftruncate_cbk.prebuf,
				      &stub->args.ftruncate_cbk.postbuf);
		else
			stub->args.ftruncate_cbk.fn (stub->frame,
						     stub->frame->cookie,
						     stub->frame->this,
						     stub->args.ftruncate_cbk.op_ret,
						     stub->args.ftruncate_cbk.op_errno,
						     &stub->args.ftruncate_cbk.prebuf,
						     &stub->args.ftruncate_cbk.postbuf);
		break;
	}
  
	case GF_FOP_FSTAT:
	{
		if (!stub->args.fstat_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.fstat_cbk.op_ret,
				      stub->args.fstat_cbk.op_errno,
				      &stub->args.fstat_cbk.buf);
		else
			stub->args.fstat_cbk.fn (stub->frame,
						 stub->frame->cookie,
						 stub->frame->this,
						 stub->args.fstat_cbk.op_ret,
						 stub->args.fstat_cbk.op_errno,
						 &stub->args.fstat_cbk.buf);
      
		break;
	}
  
	case GF_FOP_LK:
	{
		if (!stub->args.lk_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.lk_cbk.op_ret,
				      stub->args.lk_cbk.op_errno,
				      &stub->args.lk_cbk.lock);
		else
			stub->args.lk_cbk.fn (stub->frame,
					      stub->frame->cookie,
					      stub->frame->this,
					      stub->args.lk_cbk.op_ret,
					      stub->args.lk_cbk.op_errno,
					      &stub->args.lk_cbk.lock);
		break;
	}

	case GF_FOP_INODELK:
	{
		if (!stub->args.inodelk_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.inodelk_cbk.op_ret,
				      stub->args.inodelk_cbk.op_errno);

		else
			stub->args.inodelk_cbk.fn (stub->frame,
						   stub->frame->cookie,
						   stub->frame->this,
						   stub->args.inodelk_cbk.op_ret,
						   stub->args.inodelk_cbk.op_errno);
		break;
	}

	case GF_FOP_FINODELK:
	{
		if (!stub->args.finodelk_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.finodelk_cbk.op_ret,
				      stub->args.finodelk_cbk.op_errno);

		else
			stub->args.finodelk_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.finodelk_cbk.op_ret,
						    stub->args.finodelk_cbk.op_errno);
		break;
	}

	case GF_FOP_ENTRYLK:
	{
		if (!stub->args.entrylk_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.entrylk_cbk.op_ret,
				      stub->args.entrylk_cbk.op_errno);

		else
			stub->args.entrylk_cbk.fn (stub->frame,
						   stub->frame->cookie,
						   stub->frame->this,
						   stub->args.entrylk_cbk.op_ret,
						   stub->args.entrylk_cbk.op_errno);
		break;
	}

	case GF_FOP_FENTRYLK:
	{
		if (!stub->args.fentrylk_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.fentrylk_cbk.op_ret,
				      stub->args.fentrylk_cbk.op_errno);

		else
			stub->args.fentrylk_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.fentrylk_cbk.op_ret,
						    stub->args.fentrylk_cbk.op_errno);
		break;
	}
  
	case GF_FOP_LOOKUP:
	{
		if (!stub->args.lookup_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.lookup_cbk.op_ret,
				      stub->args.lookup_cbk.op_errno,
				      stub->args.lookup_cbk.inode,
				      &stub->args.lookup_cbk.buf,
                                      stub->args.lookup_cbk.dict,
                                      &stub->args.lookup_cbk.postparent);
		else
			stub->args.lookup_cbk.fn (stub->frame, 
						  stub->frame->cookie,
						  stub->frame->this,
						  stub->args.lookup_cbk.op_ret,
						  stub->args.lookup_cbk.op_errno,
						  stub->args.lookup_cbk.inode,
                                                  &stub->args.lookup_cbk.buf,
                                                  stub->args.lookup_cbk.dict,
                                                  &stub->args.lookup_cbk.postparent);
		/* FIXME NULL should not be passed */

		if (stub->args.lookup_cbk.dict)
			dict_unref (stub->args.lookup_cbk.dict);
		if (stub->args.lookup_cbk.inode)
			inode_unref (stub->args.lookup_cbk.inode);

		break;
	}
	case GF_FOP_SETDENTS:
	{
		if (!stub->args.setdents_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.setdents_cbk.op_ret,
				      stub->args.setdents_cbk.op_errno);
		else
			stub->args.setdents_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.setdents_cbk.op_ret,
						    stub->args.setdents_cbk.op_errno);
		break;
	}

	case GF_FOP_CHECKSUM:
	{
		if (!stub->args.checksum_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.checksum_cbk.op_ret,
				      stub->args.checksum_cbk.op_errno,
				      stub->args.checksum_cbk.file_checksum,
				      stub->args.checksum_cbk.dir_checksum);
		else
			stub->args.checksum_cbk.fn (stub->frame, 
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.checksum_cbk.op_ret, 
						    stub->args.checksum_cbk.op_errno,
						    stub->args.checksum_cbk.file_checksum,
						    stub->args.checksum_cbk.dir_checksum);
		if (stub->args.checksum_cbk.op_ret >= 0)
		{
			FREE (stub->args.checksum_cbk.file_checksum);
			FREE (stub->args.checksum_cbk.dir_checksum);
		}

		break;
	}

	case GF_FOP_RCHECKSUM:
	{
		if (!stub->args.rchecksum_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.rchecksum_cbk.op_ret,
				      stub->args.rchecksum_cbk.op_errno,
				      stub->args.rchecksum_cbk.weak_checksum,
				      stub->args.rchecksum_cbk.strong_checksum);
		else
			stub->args.rchecksum_cbk.fn (stub->frame, 
                                                     stub->frame->cookie,
                                                     stub->frame->this,
                                                     stub->args.rchecksum_cbk.op_ret, 
                                                     stub->args.rchecksum_cbk.op_errno,
                                                     stub->args.rchecksum_cbk.weak_checksum,
                                                     stub->args.rchecksum_cbk.strong_checksum);
		if (stub->args.rchecksum_cbk.op_ret >= 0)
		{
			FREE (stub->args.rchecksum_cbk.strong_checksum);
		}

		break;
	}

	case GF_FOP_READDIR:
	{
		if (!stub->args.readdir_cbk.fn) 
			STACK_UNWIND (stub->frame,
				      stub->args.readdir_cbk.op_ret,
				      stub->args.readdir_cbk.op_errno,
				      &stub->args.readdir_cbk.entries);
		else 
			stub->args.readdir_cbk.fn (stub->frame,
						   stub->frame->cookie,
						   stub->frame->this,
						   stub->args.readdir_cbk.op_ret,
						   stub->args.readdir_cbk.op_errno,
						   &stub->args.readdir_cbk.entries);
		
		if (stub->args.readdir_cbk.op_ret > 0) 
			gf_dirent_free (&stub->args.readdir_cbk.entries);

		break;
	}

        case GF_FOP_READDIRP:
	{
		if (!stub->args.readdirp_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.readdirp_cbk.op_ret,
				      stub->args.readdirp_cbk.op_errno,
				      &stub->args.readdirp_cbk.entries);
		else
			stub->args.readdirp_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.readdirp_cbk.op_ret,
						    stub->args.readdirp_cbk.op_errno,
						    &stub->args.readdirp_cbk.entries);

		if (stub->args.readdirp_cbk.op_ret > 0)
			gf_dirent_free (&stub->args.readdirp_cbk.entries);

		break;
	}

	case GF_FOP_XATTROP:
	{
		if (!stub->args.xattrop_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.xattrop_cbk.op_ret,
				      stub->args.xattrop_cbk.op_errno,
				      stub->args.xattrop_cbk.xattr);
		else
			stub->args.xattrop_cbk.fn (stub->frame,
						   stub->frame->cookie,
						   stub->frame->this,
						   stub->args.xattrop_cbk.op_ret,
						   stub->args.xattrop_cbk.op_errno,
						   stub->args.xattrop_cbk.xattr);

		if (stub->args.xattrop_cbk.xattr)
			dict_unref (stub->args.xattrop_cbk.xattr);

		break;
	}
	case GF_FOP_FXATTROP:
	{
		if (!stub->args.fxattrop_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.fxattrop_cbk.op_ret,
				      stub->args.fxattrop_cbk.op_errno,
				      stub->args.fxattrop_cbk.xattr);
		else
			stub->args.fxattrop_cbk.fn (stub->frame,
						    stub->frame->cookie,
						    stub->frame->this,
						    stub->args.fxattrop_cbk.op_ret,
						    stub->args.fxattrop_cbk.op_errno,
						    stub->args.fxattrop_cbk.xattr);

		if (stub->args.fxattrop_cbk.xattr)
			dict_unref (stub->args.fxattrop_cbk.xattr);

		break;
	}
	case GF_FOP_LOCK_NOTIFY:
	{
		if (!stub->args.lock_notify_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.lock_notify_cbk.op_ret,
				      stub->args.lock_notify_cbk.op_errno);
		else
			stub->args.lock_notify_cbk.fn (stub->frame,
						       stub->frame->cookie,
						       stub->frame->this,
						       stub->args.lock_notify_cbk.op_ret,
						       stub->args.lock_notify_cbk.op_errno);
		break;
	}
	case GF_FOP_LOCK_FNOTIFY:
	{
		if (!stub->args.lock_fnotify_cbk.fn)
			STACK_UNWIND (stub->frame,
				      stub->args.lock_fnotify_cbk.op_ret,
				      stub->args.lock_fnotify_cbk.op_errno);
		else
			stub->args.lock_fnotify_cbk.fn (stub->frame,
							stub->frame->cookie,
							stub->frame->this,
							stub->args.lock_fnotify_cbk.op_ret,
							stub->args.lock_fnotify_cbk.op_errno);
		break;
	}
        case GF_FOP_SETATTR:
        {
                if (!stub->args.setattr_cbk.fn)
                        STACK_UNWIND (stub->frame,
                                      stub->args.setattr_cbk.op_ret,
                                      stub->args.setattr_cbk.op_errno,
                                      &stub->args.setattr_cbk.statpre,
                                      &stub->args.setattr_cbk.statpost);
                else
                        stub->args.setattr_cbk.fn (
                                stub->frame,
                                stub->frame->cookie,
                                stub->frame->this,
                                stub->args.setattr_cbk.op_ret,
                                stub->args.setattr_cbk.op_errno,
                                &stub->args.setattr_cbk.statpre,
                                &stub->args.setattr_cbk.statpost);
                break;
        }
        case GF_FOP_FSETATTR:
        {
                if (!stub->args.fsetattr_cbk.fn)
                        STACK_UNWIND (stub->frame,
                                      stub->args.fsetattr_cbk.op_ret,
                                      stub->args.fsetattr_cbk.op_errno,
                                      &stub->args.fsetattr_cbk.statpre,
                                      &stub->args.fsetattr_cbk.statpost);
                else
                        stub->args.fsetattr_cbk.fn (
                                stub->frame,
                                stub->frame->cookie,
                                stub->frame->this,
                                stub->args.fsetattr_cbk.op_ret,
                                stub->args.fsetattr_cbk.op_errno,
                                &stub->args.fsetattr_cbk.statpre,
                                &stub->args.fsetattr_cbk.statpost);
                break;
        }
	case GF_FOP_MAXVALUE:
	{
		gf_log ("call-stub",
			GF_LOG_DEBUG,
			"Invalid value of FOP");
	}
	break;
	}
out:
	return;
}


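/* Release the references held by a wind-side stub that is being destroyed:
 * loc_wipe()s, fd_unref()s, dict_unref()s and FREE()s of duplicated strings,
 * mirroring what the corresponding fop_*_stub () constructor acquired. */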
static void
call_stub_destroy_wind (call_stub_t *stub)
{
	switch (stub->fop) {
	case GF_FOP_OPEN:
	{
		loc_wipe (&stub->args.open.loc);
		if (stub->args.open.fd)
			fd_unref (stub->args.open.fd);
		break;
	}
	case GF_FOP_CREATE:
	{
		loc_wipe (&stub->args.create.loc);
		if (stub->args.create.fd)
			fd_unref (stub->args.create.fd);
		break;
	}
	case GF_FOP_STAT:
	{
		loc_wipe (&stub->args.stat.loc);
		break;
	}
	case GF_FOP_READLINK:
	{
		loc_wipe (&stub->args.readlink.loc);
		break;
	}
  
	case GF_FOP_MKNOD:
	{
		loc_wipe (&stub->args.mknod.loc);
	}
	break;
  
	case GF_FOP_MKDIR:
	{
		loc_wipe (&stub->args.mkdir.loc);
	}
	break;
  
	case GF_FOP_UNLINK:
	{
		loc_wipe (&stub->args.unlink.loc);
	}
	break;

	case GF_FOP_RMDIR:
	{
		loc_wipe (&stub->args.rmdir.loc);
	}
	break;
      
	case GF_FOP_SYMLINK:
	{
		FREE (stub->args.symlink.linkname);
		loc_wipe (&stub->args.symlink.loc);
	}
	break;
  
	case GF_FOP_RENAME:
	{
		loc_wipe (&stub->args.rename.old);
		loc_wipe (&stub->args.rename.new);
	}
	break;

	case GF_FOP_LINK:
	{
		loc_wipe (&stub->args.link.oldloc);
		loc_wipe (&stub->args.link.newloc);
	}
	break;
  
	case GF_FOP_TRUNCATE:
	{
		loc_wipe (&stub->args.truncate.loc);
		break;
	}
      
	case GF_FOP_READ:
	{
		if (stub->args.readv.fd)
			fd_unref (stub->args.readv.fd);
		break;
	}
  
	case GF_FOP_WRITE:
	{
		struct iobref *iobref = stub->args.writev.iobref;
		if (stub->args.writev.fd)
			fd_unref (stub->args.writev.fd);
		FREE (stub->args.writev.vector);
		if (iobref)
			iobref_unref (iobref);
		break;
	}
  
	case GF_FOP_STATFS:
	{
		loc_wipe (&stub->args.statfs.loc);
		break;
	}
	case GF_FOP_FLUSH:
	{
		if (stub->args.flush.fd)
			fd_unref (stub->args.flush.fd);      
		break;
	}
  
	case GF_FOP_FSYNC:
	{
		if (stub->args.fsync.fd)
			fd_unref (stub->args.fsync.fd);
		break;
	}

	case GF_FOP_SETXATTR:
	{
		loc_wipe (&stub->args.setxattr.loc);
		if (stub->args.setxattr.dict)
			dict_unref (stub->args.setxattr.dict);
		break;
	}
  
	case GF_FOP_GETXATTR:
	{
		if (stub->args.getxattr.name)
			FREE (stub->args.getxattr.name);
		loc_wipe (&stub->args.getxattr.loc);
		break;
	}

	case GF_FOP_FSETXATTR:
	{
		fd_unref (stub->args.fsetxattr.fd);
		if (stub->args.fsetxattr.dict)
			dict_unref (stub->args.fsetxattr.dict);
		break;
	}

	case GF_FOP_FGETXATTR:
	{
		if (stub->args.fgetxattr.name)
			FREE (stub->args.fgetxattr.name);
		fd_unref (stub->args.fgetxattr.fd);
		break;
	}

	case GF_FOP_REMOVEXATTR:
	{
		loc_wipe (&stub->args.removexattr.loc);
		FREE (stub->args.removexattr.name);
		break;
	}

	case GF_FOP_OPENDIR:
	{
		loc_wipe (&stub->args.opendir.loc);
		if (stub->args.opendir.fd)
			fd_unref (stub->args.opendir.fd);
		break;
	}

	case GF_FOP_GETDENTS:
	{
		if (stub->args.getdents.fd)
			fd_unref (stub->args.getdents.fd);
		break;
	}

	case GF_FOP_FSYNCDIR:
	{
		if (stub->args.fsyncdir.fd)
			fd_unref (stub->args.fsyncdir.fd);
		break;
	}
  
	case GF_FOP_ACCESS:
	{
		loc_wipe (&stub->args.access.loc);
		break;
	}
  
	case GF_FOP_FTRUNCATE:
	{
		if (stub->args.ftruncate.fd)
			fd_unref (stub->args.ftruncate.fd);
		break;
	}
  
	case GF_FOP_FSTAT:
	{
		if (stub->args.fstat.fd)
			fd_unref (stub->args.fstat.fd);
		break;
	}
  
	case GF_FOP_LK:
	{
		if (stub->args.lk.fd)
			fd_unref (stub->args.lk.fd);
		break;
	}

	case GF_FOP_INODELK:
	{
                if (stub->args.inodelk.volume)
                        FREE (stub->args.inodelk.volume);

		loc_wipe (&stub->args.inodelk.loc);
		break;
	}
	case GF_FOP_FINODELK:
	{
                if (stub->args.finodelk.volume)
                        FREE (stub->args.finodelk.volume);

		if (stub->args.finodelk.fd)
			fd_unref (stub->args.finodelk.fd);
		break;
	}
	case GF_FOP_ENTRYLK:
	{
                if (stub->args.entrylk.volume)
                        FREE (stub->args.entrylk.volume);

		if (stub->args.entrylk.name)
			FREE (stub->args.entrylk.name);
		loc_wipe (&stub->args.entrylk.loc);
		break;
	}
	case GF_FOP_FENTRYLK:
	{
                if (stub->args.fentrylk.volume)
                        FREE (stub->args.fentrylk.volume);

		if (stub->args.fentrylk.name)
			FREE (stub->args.fentrylk.name);

 		if (stub->args.fentrylk.fd)
			fd_unref (stub->args.fentrylk.fd);
		break;
	}
  
	case GF_FOP_LOOKUP:
	{
		loc_wipe (&stub->args.lookup.loc);
		if (stub->args.lookup.xattr_req)
			dict_unref (stub->args.lookup.xattr_req);
		break;
	}

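	/* setdents keeps a private copy of the dirent chain; walk the list
	 * and free every entry along with its name. */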
	case GF_FOP_SETDENTS:
	{
		dir_entry_t *entry, *next;
		if (stub->args.setdents.fd)
			fd_unref (stub->args.setdents.fd);
		entry = stub->args.setdents.entries.next;
		while (entry) {
			next = entry->next;
			FREE (entry->name);
			FREE (entry);
			entry = next;
		}
		break;
	}

	case GF_FOP_CHECKSUM:
	{
		loc_wipe (&stub->args.checksum.loc);
		break;
	}

	case GF_FOP_RCHECKSUM:
	{
                if (stub->args.rchecksum.fd)
                        fd_unref (stub->args.rchecksum.fd);
		break;
	}

	case GF_FOP_READDIR:
	{
		if (stub->args.readdir.fd)
			fd_unref (stub->args.readdir.fd);
		break;
	}

        case GF_FOP_READDIRP:
	{
		if (stub->args.readdirp.fd)
			fd_unref (stub->args.readdirp.fd);
		break;
	}

	case GF_FOP_XATTROP:
	{
		loc_wipe (&stub->args.xattrop.loc);
		if (stub->args.xattrop.xattr)
			dict_unref (stub->args.xattrop.xattr);
		break;
	}
	case GF_FOP_FXATTROP:
	{
		if (stub->args.fxattrop.fd)
			fd_unref (stub->args.fxattrop.fd);
		if (stub->args.fxattrop.xattr)
			dict_unref (stub->args.fxattrop.xattr);
		break;
	}
	case GF_FOP_LOCK_NOTIFY:
	{
		loc_wipe (&stub->args.lock_notify.loc);
		break;
	}
	case GF_FOP_LOCK_FNOTIFY:
	{
		if (stub->args.lock_fnotify.fd)
			fd_unref (stub->args.lock_fnotify.fd);
		break;
	}
        case GF_FOP_SETATTR:
        {
                loc_wipe (&stub->args.setattr.loc);
                break;
        }
        case GF_FOP_FSETATTR:
        {
                if (stub->args.fsetattr.fd)
                        fd_unref (stub->args.fsetattr.fd);
                break;
        }
	case GF_FOP_MAXVALUE:
	{
		gf_log ("call-stub",
			GF_LOG_DEBUG,
			"Invalid value of FOP");
	}
	break;
	default:
		break;
	}
}


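/*
 * Release the references held by a stub's callback (unwind) arguments:
 * fds, inodes, dicts, dirent lists and result buffers copied into
 * args.*_cbk when the stub was filled.
 */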
static void
call_stub_destroy_unwind (call_stub_t *stub)
{
	switch (stub->fop) {
	case GF_FOP_OPEN:
	{
		if (stub->args.open_cbk.fd) 
			fd_unref (stub->args.open_cbk.fd);
	}
	break;

	case GF_FOP_CREATE:
	{
		if (stub->args.create_cbk.fd) 
			fd_unref (stub->args.create_cbk.fd);

		if (stub->args.create_cbk.inode)
			inode_unref (stub->args.create_cbk.inode);
	}
	break;

	case GF_FOP_STAT:
		break;

	case GF_FOP_READLINK:
	{
		if (stub->args.readlink_cbk.buf) 
			FREE (stub->args.readlink_cbk.buf);
	}
	break;
  
	case GF_FOP_MKNOD:
	{
		if (stub->args.mknod_cbk.inode)
			inode_unref (stub->args.mknod_cbk.inode);
	}
	break;
  
	case GF_FOP_MKDIR:
	{
		if (stub->args.mkdir_cbk.inode)
			inode_unref (stub->args.mkdir_cbk.inode);
	}
	break;
  
	case GF_FOP_UNLINK:
		break;

	case GF_FOP_RMDIR:
		break;
      
	case GF_FOP_SYMLINK:
	{
		if (stub->args.symlink_cbk.inode) 
			inode_unref (stub->args.symlink_cbk.inode);
	}
	break;
  
	case GF_FOP_RENAME:
		break;

	case GF_FOP_LINK:
	{
		if (stub->args.link_cbk.inode)
			inode_unref (stub->args.link_cbk.inode);
	}
	break;
  
	case GF_FOP_TRUNCATE:
		break;

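	/* The readv callback only carries a vector and iobref on success,
	 * so free them only when op_ret is non-negative. */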
	case GF_FOP_READ:
	{
		if (stub->args.readv_cbk.op_ret >= 0) {
			struct iobref *iobref = stub->args.readv_cbk.iobref;
			FREE (stub->args.readv_cbk.vector);
			
			if (iobref) {
				iobref_unref (iobref);
			}
		}
	}
	break;

	case GF_FOP_WRITE:
		break;
  
	case GF_FOP_STATFS:
		break;

	case GF_FOP_FLUSH:
		break;
  
	case GF_FOP_FSYNC:
		break;

	case GF_FOP_SETXATTR:
		break;
  
	case GF_FOP_GETXATTR:
	{
		if (stub->args.getxattr_cbk.dict)
			dict_unref (stub->args.getxattr_cbk.dict);
	}
	break;

	case GF_FOP_FSETXATTR:
		break;

	case GF_FOP_FGETXATTR:
	{
		if (stub->args.fgetxattr_cbk.dict)
			dict_unref (stub->args.fgetxattr_cbk.dict);
	}
	break;

	case GF_FOP_REMOVEXATTR:
		break;

	case GF_FOP_OPENDIR:
	{
		if (stub->args.opendir_cbk.fd)
			fd_unref (stub->args.opendir_cbk.fd);
	}
	break;

	case GF_FOP_GETDENTS:
	{
		dir_entry_t *tmp = NULL, *entries = NULL;

		entries = &stub->args.getdents_cbk.entries;
		if (stub->args.getdents_cbk.op_ret >= 0) {
			while (entries->next) {
				tmp = entries->next;
				entries->next = entries->next->next;
				FREE (tmp->name);
				FREE (tmp);
			}
		}
	}
	break;

	case GF_FOP_FSYNCDIR:
		break;
  
	case GF_FOP_ACCESS:
		break;
  
	case GF_FOP_FTRUNCATE:
		break;
  
	case GF_FOP_FSTAT:
		break;
  
	case GF_FOP_LK:
		break;

	case GF_FOP_INODELK:
		break;

	case GF_FOP_FINODELK:
		break;

	case GF_FOP_ENTRYLK:
		break;

	case GF_FOP_FENTRYLK:
		break;

	case GF_FOP_LOOKUP:
	{
		if (stub->args.lookup_cbk.inode)
			inode_unref (stub->args.lookup_cbk.inode);

		if (stub->args.lookup_cbk.dict)
			dict_unref (stub->args.lookup_cbk.dict);
	}
	break;

	case GF_FOP_SETDENTS:
		break;

	case GF_FOP_CHECKSUM:
	{
		if (stub->args.checksum_cbk.op_ret >= 0) {
			FREE (stub->args.checksum_cbk.file_checksum);
			FREE (stub->args.checksum_cbk.dir_checksum); 
		}
	}
  	break;

	case GF_FOP_RCHECKSUM:
	{
		if (stub->args.rchecksum_cbk.op_ret >= 0) {
			FREE (stub->args.rchecksum_cbk.strong_checksum); 
		}
	}
  	break;

	case GF_FOP_READDIR:
	{
		if (stub->args.readdir_cbk.op_ret > 0) {
			gf_dirent_free (&stub->args.readdir_cbk.entries);
		}
	}
	break;

        case GF_FOP_READDIRP:
	{
		if (stub->args.readdirp_cbk.op_ret > 0) {
			gf_dirent_free (&stub->args.readdirp_cbk.entries);
		}
	}
	break;

	case GF_FOP_XATTROP:
	{
		if (stub->args.xattrop_cbk.xattr)
			dict_unref (stub->args.xattrop_cbk.xattr);
	}
	break;

	case GF_FOP_FXATTROP:
	{
		if (stub->args.fxattrop_cbk.xattr) 
			dict_unref (stub->args.fxattrop_cbk.xattr);
	}
	break;
        
        case GF_FOP_SETATTR:
        {
                break;
        }

        case GF_FOP_FSETATTR:
        {
                break;
        }

	case GF_FOP_MAXVALUE:
	{
		gf_log ("call-stub",
			GF_LOG_DEBUG,
			"Invalid value of FOP");
	}
	break;

	default:
		break;
	}
}

 
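/*
 * Free a stub along with whatever it still references: wind stubs drop
 * their copied fop arguments, unwind stubs drop their copied callback
 * results, then the stub itself is freed.
 */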
void
call_stub_destroy (call_stub_t *stub)
{
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
	
	if (stub->wind) {
		call_stub_destroy_wind (stub);
	} else {
		call_stub_destroy_unwind (stub);
	}

	FREE (stub);
out:
	return;
}

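/*
 * Resume a parked stub: unlink it from its list, switch THIS to the
 * xlator the stub's frame belongs to, replay either the wind or the
 * unwind path, and finally destroy the stub.
 */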
void
call_resume (call_stub_t *stub)
{
        xlator_t *old_THIS = NULL;

	errno = EINVAL;
	GF_VALIDATE_OR_GOTO ("call-stub", stub, out);

	list_del_init (&stub->list);

        old_THIS = THIS;
        THIS = stub->frame->this;
        {
                if (stub->wind)
                        call_resume_wind (stub);
                else
                        call_resume_unwind (stub);
        }
        THIS = old_THIS;

	call_stub_destroy (stub);
out:
	return;
}


-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information. But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
- The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed. Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
- Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
- 7. Additional Terms.
-
- "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law. If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
- When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it. (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.) You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
- Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
- a) Disclaiming warranty or limiting liability differently from the
- terms of sections 15 and 16 of this License; or
-
- b) Requiring preservation of specified reasonable legal notices or
- author attributions in that material or in the Appropriate Legal
- Notices displayed by works containing it; or
-
- c) Prohibiting misrepresentation of the origin of that material, or
- requiring that modified versions of such material be marked in
- reasonable ways as different from the original version; or
-
- d) Limiting the use for publicity purposes of names of licensors or
- authors of the material; or
-
- e) Declining to grant rights under trademark law for use of some
- trade names, trademarks, or service marks; or
-
- f) Requiring indemnification of licensors and authors of that
- material by anyone who conveys the material (or modified versions of
- it) with contractual assumptions of liability to the recipient, for
- any liability that these contractual assumptions directly impose on
- those licensors and authors.
-
- All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10. If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term. If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
- If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
- Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
- 8. Termination.
-
- You may not propagate or modify a covered work except as expressly
-provided under this License. Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
- However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
- Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
- Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License. If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
- 9. Acceptance Not Required for Having Copies.
-
- You are not required to accept this License in order to receive or
-run a copy of the Program. Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance. However,
-nothing other than this License grants you permission to propagate or
-modify any covered work. These actions infringe copyright if you do
-not accept this License. Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
- 10. Automatic Licensing of Downstream Recipients.
-
- Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License. You are not responsible
-for enforcing compliance by third parties with this License.
-
- An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations. If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
- You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License. For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
- 11. Patents.
-
- A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based. The
-work thus licensed is called the contributor's "contributor version".
-
- A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version. For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
- Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
- In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement). To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
- If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients. "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
- If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
- A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License. You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
- Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
- 12. No Surrender of Others' Freedom.
-
- If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License. If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all. For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
- 13. Use with the GNU General Public License.
-
- Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU General Public License into a single
-combined work, and to convey the resulting work. The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
- 14. Revised Versions of this License.
-
- The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time. Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
- Each version is given a distinguishing version number. If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation. If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
- If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
- Later license versions may give you additional or different
-permissions. However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
- 15. Disclaimer of Warranty.
-
- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
- 16. Limitation of Liability.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
- 17. Interpretation of Sections 15 and 16.
-
- If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
-
- If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
- To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
- <one line to give the program's name and a brief idea of what it does.>
- Copyright (C) <year> <name of author>
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
- If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
- <program> Copyright (C) <year> <name of author>
- This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
- This is free software, and you are welcome to redistribute it
- under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License. Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
- You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<http://www.gnu.org/licenses/>.
-
- The GNU General Public License does not permit incorporating your program
-into proprietary programs. If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library. If this is what you want to do, use the GNU Lesser General
-Public License instead of this License. But first, please read
-<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/COPYING-GPLV2 b/COPYING-GPLV2
new file mode 100644
index 000000000..d159169d1
--- /dev/null
+++ b/COPYING-GPLV2
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/COPYING-LGPLV3 b/COPYING-LGPLV3
new file mode 100644
index 000000000..65c5ca88a
--- /dev/null
+++ b/COPYING-LGPLV3
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/EXCEPTIONS b/EXCEPTIONS
deleted file mode 100644
index 4f1845cd4..000000000
--- a/EXCEPTIONS
+++ /dev/null
@@ -1,14 +0,0 @@
-GlusterFS exceptions to GNU GPL
-
-Copyright © 2011 Gluster, Inc.
-
-Everyone is permitted to copy and distribute verbatim copies of this license
-document, but changing it is not allowed.
-Gluster has developed the APIs and the libraries in its code to enable third
-parties to link their software to GlusterFS software without requiring that
-such third party software be licensed under the GNU General Public License
-version 3 ("GNU GPL3"). Consequently, Gluster does not view the linking of
-third party software to GlusterFS through APIs or the libraries as requiring
-the combination to be distributed under GNU GPLv3. However, your reproduction,
-modification or other use of the GlusterFS software alone must comply with the
-terms of GNU GPL3.
diff --git a/Makefile.am b/Makefile.am
index 6a5bc0251..598ebb410 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,8 +1,18 @@
-EXTRA_DIST = autogen.sh COPYING INSTALL README AUTHORS THANKS NEWS EXCEPTIONS glusterfs.spec
+EXTRA_DIST = autogen.sh \
+ COPYING-GPLV2 COPYING-LGPLV3 \
+ INSTALL README AUTHORS THANKS NEWS \
+ glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in \
+ error-codes.json gf-error-codes.h.template \
+ gen-headers.py run-tests.sh \
+ $(shell find $(top_srcdir)/tests -type f -print)
-SUBDIRS = argp-standalone libglusterfs rpc xlators glusterfsd $(FUSERMOUNT_SUBDIR) doc extras cli
+SUBDIRS = argp-standalone libglusterfs rpc api xlators glusterfsd \
+ $(FUSERMOUNT_SUBDIR) doc extras cli @SYNCDAEMON_SUBDIR@
-CLEANFILES =
+pkgconfigdir = @pkgconfigdir@
+pkgconfig_DATA = glusterfs-api.pc libgfchangelog.pc
+
+CLEANFILES =
gitclean: distclean
find . -name Makefile.in -exec rm -f {} \;
@@ -11,4 +21,9 @@ gitclean: distclean
rm -fr autom4te.cache
rm -f missing aclocal.m4 config.h.in config.guess config.sub ltmain.sh install-sh configure depcomp
rm -fr argp-standalone/autom4te.cache
- rm -f argp-standalone/aclocal.m4 argp-standalone/config.h.in argp-standalone/configure argp-standalone/depcomp argp-standalone/install-sh argp-standalone/missing
+ rm -f argp-standalone/aclocal.m4 argp-standalone/config.h.in
+ rm -f argp-standalone/configure argp-standalone/depcomp
+ rm -f argp-standalone/install-sh argp-standalone/missing
+
+dist-hook:
+ (cd $(srcdir) && git diff && echo ===== git log ==== && git log) > $(distdir)/ChangeLog
diff --git a/THANKS b/THANKS
index ea9816662..e1ce5105d 100644
--- a/THANKS
+++ b/THANKS
@@ -1,3 +1 @@
-
-For all of you, who use the product and help us making it more robust, useful, popular.
-
+For all of you who use the product and help us make it more robust, useful and popular.
diff --git a/api/Makefile.am b/api/Makefile.am
new file mode 100644
index 000000000..f0ad1ee97
--- /dev/null
+++ b/api/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src examples
diff --git a/api/examples/Makefile.am b/api/examples/Makefile.am
new file mode 100644
index 000000000..05f40ff53
--- /dev/null
+++ b/api/examples/Makefile.am
@@ -0,0 +1,6 @@
+EXTRA_PROGRAMS = glfsxmp
+glfsxmp_SOURCES = glfsxmp.c
+glfsxmp_CFLAGS = $(GLFS_CFLAGS) -Wall
+glfsxmp_LDADD = $(GLFS_LIBS) -lrt
+
+EXTRA_DIST = gfapi.py
diff --git a/api/examples/README b/api/examples/README
new file mode 100644
index 000000000..4d2b521f7
--- /dev/null
+++ b/api/examples/README
@@ -0,0 +1,36 @@
+This is an example application which uses libgfapi. It is
+a complete autotools based build system which demonstrates the
+required changes in configure.ac, Makefile.am etc. to successfully
+detect and build an application against libgfapi.
+
+There are two approaches to building a libgfapi based application:
+
+1. In the presence of pkg-config in your build system.
+This is the recommended approach which is also used in this example.
+For this approach to work, you need to build glusterfs by passing
+--pkgconfigdir=/usr/lib64/pkgconfig (or the appropriate directory)
+in your distro. This already happens if you build RPMs with the
+glusterfs.spec provided in glusterfs.git. You will also need to
+install glusterfs-api RPM.
+
+2. In the absence of pkg-config in your build system.
+Make sure your LDFLAGS includes -L/path/to/lib where libgfapi.so is
+installed, and your CPPFLAGS includes -I/path/to/include/glusterfs where
+the 'api' directory containing the headers is available.
+
+glfsxmp.c
+=========
+
+glfsxmp.c is an example application which uses libgfapi.
+
+Compilation Steps For glfsxmp.c
+===============================
+
+1. $./autogen.sh
+2. $./configure
+
+Note: Before running ./configure, as mentioned above, you need to
+      take care of #1 or #2, i.e. set the pkg-config path, or the
+      LDFLAGS/CPPFLAGS and -I/<path>, to the correct values.
+
+3. $make glfsxmp
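
Beyond the C example (glfsxmp.c), this change also ships api/examples/gfapi.py,
a ctypes wrapper around libgfapi. As a rough sketch of how those bindings might
be driven once a volume is reachable (the server name, volume name and paths
below are placeholders, not values taken from this tree), a Python 2 snippet
along these lines could be used:

    #!/usr/bin/python
    # Hypothetical walk-through of the gfapi.py bindings; the host, volume
    # and paths are made up for illustration only.
    import os
    from gfapi import Volume

    vol = Volume("gluster.example.com", "testvol")  # placeholder volfile server and volume
    vol.set_logging("/dev/stderr", 7)               # verbose logging to stderr
    vol.mount()                                     # wraps glfs_init()

    fd = vol.creat("/hello.txt", os.O_WRONLY | os.O_EXCL, 0644)
    if fd:
        fd.write("hello from libgfapi\n")           # returns number of bytes written
        fd = None                                   # dropping the File object runs glfs_close()

    fd = vol.open("/hello.txt", os.O_RDONLY)
    if fd:
        print fd.read(4096)                         # data on success, rc < 0 on error

Error handling is intentionally thin here: as the File and Volume classes in
gfapi.py below show, creat() and open() return a false value on failure, and
the read/write wrappers simply pass through the return code of the underlying
glfs_* call.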
diff --git a/api/examples/autogen.sh b/api/examples/autogen.sh
new file mode 100755
index 000000000..1fee6be11
--- /dev/null
+++ b/api/examples/autogen.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+aclocal
+autoconf
+automake --foreign
diff --git a/api/examples/configure.ac b/api/examples/configure.ac
new file mode 100644
index 000000000..b80177a4e
--- /dev/null
+++ b/api/examples/configure.ac
@@ -0,0 +1,12 @@
+
+AC_INIT([glfs-test],[0.1],[gluster-devel@nongnu.org])
+
+AM_INIT_AUTOMAKE
+
+AC_CONFIG_FILES([Makefile])
+
+AC_PROG_CC
+
+PKG_CHECK_MODULES([GLFS], [glusterfs-api >= 3])
+
+AC_OUTPUT
diff --git a/api/examples/gfapi.py b/api/examples/gfapi.py
new file mode 100755
index 000000000..3ac67f4d5
--- /dev/null
+++ b/api/examples/gfapi.py
@@ -0,0 +1,422 @@
+#!/usr/bin/python
+
+from ctypes import *
+from ctypes.util import find_library
+import os
+import sys
+import time
+import types
+
+# Looks like ctypes is having trouble with dependencies, so just force them to
+# load with RTLD_GLOBAL until I figure that out.
+glfs = CDLL(find_library("glusterfs"),RTLD_GLOBAL)
+xdr = CDLL(find_library("gfxdr"),RTLD_GLOBAL)
+api = CDLL(find_library("gfapi"),RTLD_GLOBAL)
+
+# Wow, the Linux kernel folks really play nasty games with this structure. If
+# you look at the man page for stat(2) and then at this definition you'll note
+# two discrepancies. First, we seem to have st_nlink and st_mode reversed. In
+# fact that's exactly how they're defined *for 64-bit systems*; for 32-bit
+# they're in the man-page order. Even uglier, the man page makes no mention of
+# the *nsec fields, but they are very much present and if they're not included
+# then we get memory corruption because libgfapi has a structure definition
+# that's longer than ours and they overwrite some random bit of memory after
+# the space we allocated. Yes, that's all very disgusting, and I'm still not
+# sure this will really work on 32-bit because all of the field types are so
+# obfuscated behind macros and feature checks.
+class Stat (Structure):
+ _fields_ = [
+ ("st_dev", c_ulong),
+ ("st_ino", c_ulong),
+ ("st_nlink", c_ulong),
+ ("st_mode", c_uint),
+ ("st_uid", c_uint),
+ ("st_gid", c_uint),
+ ("st_rdev", c_ulong),
+ ("st_size", c_ulong),
+ ("st_blksize", c_ulong),
+ ("st_blocks", c_ulong),
+ ("st_atime", c_ulong),
+ ("st_atimensec", c_ulong),
+ ("st_mtime", c_ulong),
+ ("st_mtimensec", c_ulong),
+ ("st_ctime", c_ulong),
+ ("st_ctimensec", c_ulong),
+ ]
+api.glfs_creat.restype = c_void_p
+api.glfs_open.restype = c_void_p
+api.glfs_lstat.restype = c_int
+api.glfs_lstat.argtypes = [c_void_p, c_char_p, POINTER(Stat)]
+
+class Dirent (Structure):
+ _fields_ = [
+ ("d_ino", c_ulong),
+ ("d_off", c_ulong),
+ ("d_reclen", c_ushort),
+ ("d_type", c_char),
+ ("d_name", c_char * 256),
+ ]
+api.glfs_opendir.restype = c_void_p
+api.glfs_readdir_r.restype = c_int
+api.glfs_readdir_r.argtypes = [c_void_p, POINTER(Dirent),
+ POINTER(POINTER(Dirent))]
+
+# There's a bit of ctypes glitchiness around __del__ functions and module-level
+# variables. If we unload the module while we still have references to File or
+# Volume objects, the module-level variables might have disappeared by the time
+# __del__ gets called. Therefore the objects hold references which they
+# release when __del__ is done. We only actually use the object-local values
+# in __del__; for clarity, we just use the simpler module-level form elsewhere.
+
+class File(object):
+
+ def __init__ (self, fd):
+ # Add a reference so the module-level variable "api" doesn't
+ # get yanked out from under us (see comment above File def'n).
+ self._api = api
+ self.fd = fd
+
+ def __del__ (self):
+ self._api.glfs_close(self.fd)
+ self._api = None
+
+ # File operations, in alphabetical order.
+
+ def fsync (self):
+ return api.glfs_fsync(self.fd)
+
+ def read (self, buflen, flags=0):
+ rbuf = create_string_buffer(buflen)
+ rc = api.glfs_read(self.fd,rbuf,buflen,flags)
+ if rc > 0:
+ return rbuf.value[:rc]
+ else:
+ return rc
+
+ def read_buffer (self, buf, flags=0):
+ return api.glfs_read(self.fd,buf,len(buf),flags)
+
+ def write (self, data, flags=0):
+ return api.glfs_write(self.fd,data,len(data),flags)
+
+ def fallocate (self, mode, offset, len):
+ return api.glfs_fallocate(self.fd, mode, offset, len)
+
+ def discard (self, offset, len):
+ return api.glfs_discard(self.fd, offset, len)
+
+
+class Dir(object):
+
+ def __init__ (self, fd):
+ # Add a reference so the module-level variable "api" doesn't
+ # get yanked out from under us (see comment above File def'n).
+ self._api = api
+ self.fd = fd
+ self.cursor = POINTER(Dirent)()
+
+ def __del__ (self):
+ self._api.glfs_closedir(self.fd)
+ self._api = None
+
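+ # Return the next Dirent, or an integer: 0 when the end of the
+ # directory has been reached, negative on error.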
+ def next (self):
+ entry = Dirent()
+ entry.d_reclen = 256
+ rc = api.glfs_readdir_r(self.fd,byref(entry),byref(self.cursor))
+ if (rc < 0) or (not self.cursor) or (not self.cursor.contents):
+ return rc
+ return entry
+
+class Volume(object):
+
+ # Housekeeping functions.
+
+ def __init__ (self, host, volid, proto="tcp", port=24007):
+ # Add a reference so the module-level variable "api" doesn't
+ # get yanked out from under us (see comment above File def'n).
+ self._api = api
+ self.fs = api.glfs_new(volid)
+ api.glfs_set_volfile_server(self.fs,proto,host,port)
+
+ def __del__ (self):
+ self._api.glfs_fini(self.fs)
+ self._api = None
+
+ def set_logging (self, path, level):
+ api.glfs_set_logging(self.fs,path,level)
+
+ def mount (self):
+ api.glfs_init(self.fs)
+
+ # File operations, in alphabetical order.
+
+ def creat (self, path, flags, mode):
+ fd = api.glfs_creat(self.fs,path,flags,mode)
+ if not fd:
+ return fd
+ return File(fd)
+
+ def getxattr (self, path, key, maxlen):
+ buf = create_string_buffer(maxlen)
+ rc = api.glfs_getxattr(self.fs,path,key,buf,maxlen)
+ if rc < 0:
+ return rc
+ return buf.value[:rc]
+
+ def listxattr (self, path):
+ buf = create_string_buffer(512)
+ rc = api.glfs_listxattr(self.fs,path,buf,512)
+ if rc < 0:
+ return rc
+ xattrs = []
+ # Parsing character by character is ugly, but it seems like the
+ # easiest way to deal with the "strings separated by NUL in one
+ # buffer" format.
+ i = 0
+ while i < rc:
+ new_xa = buf.raw[i]
+ i += 1
+ while i < rc:
+ next_char = buf.raw[i]
+ i += 1
+ if next_char == '\0':
+ xattrs.append(new_xa)
+ break
+ new_xa += next_char
+ xattrs.sort()
+ return xattrs
+
+ def lstat (self, path):
+ x = Stat()
+ rc = api.glfs_lstat(self.fs,path,byref(x))
+ if rc >= 0:
+ return x
+ else:
+ return rc
+
+ def mkdir (self, path):
+ return api.glfs_mkdir(self.fs,path)
+
+ def open (self, path, flags):
+ fd = api.glfs_open(self.fs,path,flags)
+ if not fd:
+ return fd
+ return File(fd)
+
+ def opendir (self, path):
+ fd = api.glfs_opendir(self.fs,path)
+ if not fd:
+ return fd
+ return Dir(fd)
+
+ def rename (self, opath, npath):
+ return api.glfs_rename(self.fs,opath,npath)
+
+ def rmdir (self, path):
+ return api.glfs_rmdir(self.fs,path)
+
+ def setxattr (self, path, key, value, vlen):
+ return api.glfs_setxattr(self.fs,path,key,value,vlen,0)
+
+ def unlink (self, path):
+ return api.glfs_unlink(self.fs,path)
+
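+# A minimal usage sketch (assumes a volume named "testvol" is served from
+# localhost; the names below are illustrative only):
+#
+#   vol = Volume("localhost", "testvol")
+#   vol.mount()
+#   fd = vol.creat("/hello.txt", os.O_WRONLY | os.O_EXCL, 0644)
+#   fd.write("hello world")
+#   print vol.lstat("/hello.txt").st_size
+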
+if __name__ == "__main__":
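+ # Each test_* function below takes (vol, path, data) and returns a
+ # (passed, message) tuple; the driver at the bottom runs them in order.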
+ def test_create_write (vol, path, data):
+ mypath = path + ".io"
+ fd = vol.creat(mypath,os.O_WRONLY|os.O_EXCL,0644)
+ if not fd:
+ return False, "creat error"
+ rc = fd.write(data)
+ if rc != len(data):
+ return False, "wrote %d/%d bytes" % (rc, len(data))
+ return True, "wrote %d bytes" % rc
+
+ # TBD: this test fails if we do create, open, write, read
+ def test_open_read (vol, path, data):
+ mypath = path + ".io"
+ fd = vol.open(mypath,os.O_RDONLY)
+ if not fd:
+ return False, "open error"
+ dlen = len(data) * 2
+ buf = fd.read(dlen)
+ if type(buf) == types.IntType:
+ return False, "read error %d" % buf
+ if len(buf) != len(data):
+ return False, "read %d/%d bytes" % (len(buf), len(data))
+ return True, "read '%s'" % buf
+
+ def test_lstat (vol, path, data):
+ mypath = path + ".io"
+ sb = vol.lstat(mypath)
+ if type(sb) == types.IntType:
+ return False, "lstat error %d" % sb
+ if sb.st_size != len(data):
+ return False, "lstat size is %d, expected %d" % (
+ sb.st_size, len(data))
+ return True, "lstat got correct size %d" % sb.st_size
+
+ def test_rename (vol, path, data):
+ opath = path + ".io"
+ npath = path + ".tmp"
+ rc = vol.rename(opath,npath)
+ if rc < 0:
+ return False, "rename error %d" % rc
+ ofd = vol.open(opath,os.O_RDWR)
+ if isinstance(ofd,File):
+ return False, "old path working after rename"
+ nfd = vol.open(npath,os.O_RDWR)
+ if isinstance(nfd,File):
+ return False, "new path not working after rename"
+ return True, "rename worked"
+
+ def test_unlink (vol, path, data):
+ mypath = path + ".tmp"
+ rc = vol.unlink(mypath)
+ if rc < 0:
+ return False, "unlink error %d" % fd
+ fd = vol.open(mypath,os.O_RDWR)
+ if isinstance(fd,File):
+ return False, "path still usable after unlink"
+ return True, "unlink worked"
+
+ def test_mkdir (vol, path, data):
+ mypath = path + ".dir"
+ rc = vol.mkdir(mypath)
+ if rc < 0:
+ return False, "mkdir error %d" % rc
+ return True, "mkdir worked"
+
+ def test_create_in_dir (vol, path, data):
+ mypath = path + ".dir/probe"
+ fd = vol.creat(mypath,os.O_RDWR,0644)
+ if not isinstance(fd,File):
+ return False, "create (in dir) error"
+ return True, "create (in dir) worked"
+
+ def test_dir_listing (vol, path, data):
+ mypath = path + ".dir"
+ fd = vol.opendir(mypath)
+ if not isinstance(fd,Dir):
+ return False, "opendir error %d" % fd
+ files = []
+ while True:
+ ent = fd.next()
+ if not isinstance(ent,Dirent):
+ break
+ name = ent.d_name[:ent.d_reclen]
+ files.append(name)
+ if files != [".", "..", "probe"]:
+ return False, "wrong directory contents"
+ return True, "directory listing worked"
+
+ def test_unlink_in_dir (vol, path, data):
+ mypath = path + ".dir/probe"
+ rc = vol.unlink(mypath)
+ if rc < 0:
+ return False, "unlink (in dir) error %d" % rc
+ return True, "unlink (in dir) worked"
+
+ def test_rmdir (vol, path, data):
+ mypath = path + ".dir"
+ rc = vol.rmdir(mypath)
+ if rc < 0:
+ return False, "rmdir error %d" % rc
+ sb = vol.lstat(mypath)
+ if not isinstance(sb,Stat):
+ return False, "dir still there after rmdir"
+ return True, "rmdir worked"
+
+ def test_setxattr (vol, path, data):
+ mypath = path + ".xa"
+ fd = vol.creat(mypath,os.O_RDWR|os.O_EXCL,0644)
+ if not fd:
+ return False, "creat (xattr test) error"
+ key1, key2 = "hello", "goodbye"
+ if vol.setxattr(mypath,"trusted.key1",key1,len(key1)) < 0:
+ return False, "setxattr (key1) error"
+ if vol.setxattr(mypath,"trusted.key2",key2,len(key2)) < 0:
+ return False, "setxattr (key2) error"
+ return True, "setxattr worked"
+
+ def test_getxattr (vol, path, data):
+ mypath = path + ".xa"
+ buf = vol.getxattr(mypath,"trusted.key1",32)
+ if type(buf) == types.IntType:
+ return False, "getxattr error"
+ if buf != "hello":
+ return False, "wrong getxattr value %s" % buf
+ return True, "getxattr worked"
+
+ def test_listxattr (vol, path, data):
+ mypath = path + ".xa"
+ xattrs = vol.listxattr(mypath)
+ if type(xattrs) == types.IntType:
+ return False, "listxattr error"
+ if xattrs != ["trusted.key1","trusted.key2"]:
+ return False, "wrong listxattr value %s" % repr(xattrs)
+ return True, "listxattr worked"
+
+ def test_fallocate (vol, path, data):
+ mypath = path + ".io"
+ fd = vol.creat(mypath,os.O_WRONLY|os.O_EXCL,0644)
+ if not fd:
+ return False, "creat error"
+ rc = fd.fallocate(0, 0, 1024*1024)
+ if rc != 0:
+ return False, "fallocate error"
+ rc = fd.discard(4096, 4096)
+ if rc != 0:
+ return False, "discard error"
+ return True, "fallocate/discard worked"
+
+ test_list = (
+ test_create_write,
+ test_open_read,
+ test_lstat,
+ test_rename,
+ test_unlink,
+ test_mkdir,
+ test_create_in_dir,
+ test_dir_listing,
+ test_unlink_in_dir,
+ test_rmdir,
+ test_setxattr,
+ test_getxattr,
+ test_listxattr,
+ test_fallocate,
+ )
+
+ ok_to_fail = (
+ # TBD: this fails opening the new file, even though the file
+ # did get renamed. Looks like a gfapi bug, not ours.
+ (test_rename, "new path not working after rename"),
+ # TBD: similar, call returns error even though it worked
+ (test_rmdir, "dir still there after rmdir"),
+ )
+
+ volid, path = sys.argv[1:3]
+ data = "fubar"
+ vol = Volume("localhost",volid)
+ vol.set_logging("/dev/null",7)
+ #vol.set_logging("/dev/stderr",7)
+ vol.mount()
+
+ failures = 0
+ expected = 0
+ for t in test_list:
+ rc, msg = t(vol,path,data)
+ if rc:
+ print "PASS: %s" % msg
+ else:
+ print "FAIL: %s" % msg
+ failures += 1
+ for otf in ok_to_fail:
+ if (t == otf[0]) and (msg == otf[1]):
+ print " (skipping known failure)"
+ expected += 1
+ break # from the *inner* for loop
+ else:
+ break # from the *outer* for loop
+
+ print "%d failures (%d expected)" % (failures, expected)
diff --git a/api/examples/glfsxmp.c b/api/examples/glfsxmp.c
new file mode 100644
index 000000000..600d72fb5
--- /dev/null
+++ b/api/examples/glfsxmp.c
@@ -0,0 +1,1598 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include "api/glfs.h"
+#include "api/glfs-handles.h"
+#include <string.h>
+#include <time.h>
+
+
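+/* List the entries of the volume root with glfs_opendir/glfs_readdir_r and
+   print each name along with the directory offset from glfs_telldir. */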
+int
+test_dirops (glfs_t *fs)
+{
+ glfs_fd_t *fd = NULL;
+ char buf[512];
+ struct dirent *entry = NULL;
+
+ fd = glfs_opendir (fs, "/");
+ if (!fd) {
+ fprintf (stderr, "/: %s\n", strerror (errno));
+ return -1;
+ }
+
+ fprintf (stderr, "Entries:\n");
+ while (glfs_readdir_r (fd, (struct dirent *)buf, &entry), entry) {
+ fprintf (stderr, "%s: %lu\n", entry->d_name, glfs_telldir (fd));
+ }
+
+ glfs_closedir (fd);
+ return 0;
+}
+
+
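+/* Set two user xattrs on an existing file, then list them back and print
+   each key from the NUL-separated result buffer. */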
+int
+test_xattr (glfs_t *fs)
+{
+ char *filename = "/filename2";
+ char buf[512];
+ char *ptr;
+ int ret;
+
+ ret = glfs_setxattr (fs, filename, "user.testkey", "testval", 8, 0);
+ fprintf (stderr, "setxattr(%s): %d (%s)\n", filename, ret,
+ strerror (errno));
+
+ ret = glfs_setxattr (fs, filename, "user.testkey2", "testval", 8, 0);
+ fprintf (stderr, "setxattr(%s): %d (%s)\n", filename, ret,
+ strerror (errno));
+
+ ret = glfs_listxattr (fs, filename, buf, 512);
+ fprintf (stderr, "listxattr(%s): %d (%s)\n", filename, ret,
+ strerror (errno));
+ if (ret < 0)
+ return -1;
+
+ for (ptr = buf; ptr < buf + ret; ptr++) {
+ printf ("key=%s\n", ptr);
+ ptr += strlen (ptr);
+ }
+
+ return 0;
+}
+
+
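+/* Exercise glfs_mkdir, glfs_symlink, glfs_chdir, glfs_getcwd and
+   glfs_realpath by walking into a directory through a symlink. */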
+int
+test_chdir (glfs_t *fs)
+{
+ int ret = -1;
+ char *topdir = "/topdir";
+ char *linkdir = "/linkdir";
+ char *subdir = "./subdir";
+ char *respath = NULL;
+ char pathbuf[4096];
+
+ ret = glfs_mkdir (fs, topdir, 0755);
+ if (ret) {
+ fprintf (stderr, "mkdir(%s): %s\n", topdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_getcwd (fs, pathbuf, 4096);
+ fprintf (stdout, "getcwd() = %s\n", respath);
+
+ ret = glfs_symlink (fs, topdir, linkdir);
+ if (ret) {
+ fprintf (stderr, "symlink(%s, %s): %s\n", topdir, linkdir, strerror (errno));
+ return -1;
+ }
+
+ ret = glfs_chdir (fs, linkdir);
+ if (ret) {
+ fprintf (stderr, "chdir(%s): %s\n", linkdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_getcwd (fs, pathbuf, 4096);
+ fprintf (stdout, "getcwd() = %s\n", respath);
+
+ respath = glfs_realpath (fs, subdir, pathbuf);
+ if (respath) {
+ fprintf (stderr, "realpath(%s) worked unexpectedly: %s\n", subdir, respath);
+ return -1;
+ }
+
+ ret = glfs_mkdir (fs, subdir, 0755);
+ if (ret) {
+ fprintf (stderr, "mkdir(%s): %s\n", subdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_realpath (fs, subdir, pathbuf);
+ if (!respath) {
+ fprintf (stderr, "realpath(%s): %s\n", subdir, strerror (errno));
+ } else {
+ fprintf (stdout, "realpath(%s) = %s\n", subdir, respath);
+ }
+
+ ret = glfs_chdir (fs, subdir);
+ if (ret) {
+ fprintf (stderr, "chdir(%s): %s\n", subdir, strerror (errno));
+ return -1;
+ }
+
+ respath = glfs_getcwd (fs, pathbuf, 4096);
+ fprintf (stdout, "getcwd() = %s\n", respath);
+
+ respath = glfs_realpath (fs, "/linkdir/subdir", pathbuf);
+ if (!respath) {
+ fprintf (stderr, "realpath(/linkdir/subdir): %s\n", strerror (errno));
+ } else {
+ fprintf (stdout, "realpath(/linkdir/subdir) = %s\n", respath);
+ }
+
+ return 0;
+}
+
+#ifdef DEBUG
+static void
+peek_stat (struct stat *sb)
+{
+ printf ("Dumping stat information:\n");
+ printf ("File type: ");
+
+ switch (sb->st_mode & S_IFMT) {
+ case S_IFBLK: printf ("block device\n"); break;
+ case S_IFCHR: printf ("character device\n"); break;
+ case S_IFDIR: printf ("directory\n"); break;
+ case S_IFIFO: printf ("FIFO/pipe\n"); break;
+ case S_IFLNK: printf ("symlink\n"); break;
+ case S_IFREG: printf ("regular file\n"); break;
+ case S_IFSOCK: printf ("socket\n"); break;
+ default: printf ("unknown?\n"); break;
+ }
+
+ printf ("I-node number: %ld\n", (long) sb->st_ino);
+
+ printf ("Mode: %lo (octal)\n",
+ (unsigned long) sb->st_mode);
+
+ printf ("Link count: %ld\n", (long) sb->st_nlink);
+ printf ("Ownership: UID=%ld GID=%ld\n",
+ (long) sb->st_uid, (long) sb->st_gid);
+
+ printf ("Preferred I/O block size: %ld bytes\n",
+ (long) sb->st_blksize);
+ printf ("File size: %lld bytes\n",
+ (long long) sb->st_size);
+ printf ("Blocks allocated: %lld\n",
+ (long long) sb->st_blocks);
+
+ printf ("Last status change: %s", ctime(&sb->st_ctime));
+ printf ("Last file access: %s", ctime(&sb->st_atime));
+ printf ("Last file modification: %s", ctime(&sb->st_mtime));
+
+ return;
+}
+
+static void
+peek_handle (unsigned char *glid)
+{
+ int i;
+
+ for (i = 0; i < GFAPI_HANDLE_LENGTH; i++)
+ {
+ printf (":%02x:", glid[i]);
+ }
+ printf ("\n");
+}
+#else /* DEBUG */
+static void
+peek_stat (struct stat *sb)
+{
+ return;
+}
+
+static void
+peek_handle (unsigned char *id)
+{
+ return;
+}
+#endif /* DEBUG */
+
+glfs_t *fs = NULL;
+char *full_parent_name = "/testdir", *parent_name = "testdir";
+
+void
+test_h_unlink (void)
+{
+ char *my_dir = "unlinkdir";
+ char *my_file = "file.txt";
+ char *my_subdir = "dir1";
+ struct glfs_object *parent = NULL, *leaf = NULL, *dir = NULL,
+ *subdir = NULL, *subleaf = NULL;
+ struct stat sb;
+ int ret;
+
+ printf ("glfs_h_unlink tests: In Progress\n");
+
+ /* Prepare tests */
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dir = glfs_h_mkdir (fs, parent, my_dir, 0644, &sb);
+ if (dir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, parent, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ leaf = glfs_h_creat (fs, dir, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ subdir = glfs_h_mkdir (fs, dir, my_subdir, 0644, &sb);
+ if (subdir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_subdir, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ subleaf = glfs_h_creat (fs, subdir, my_file, O_CREAT, 0644, &sb);
+ if (subleaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, subdir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink non empty directory */
+ ret = glfs_h_unlink (fs, dir, my_subdir);
+ if ((ret && errno != ENOTEMPTY) || (ret == 0)) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: it is non empty: %s\n",
+ my_subdir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink regular file */
+ ret = glfs_h_unlink (fs, subdir, my_file);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_file, subdir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink directory */
+ ret = glfs_h_unlink (fs, dir, my_subdir);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_subdir, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink regular file */
+ ret = glfs_h_unlink (fs, dir, my_file);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink non-existent regular file */
+ ret = glfs_h_unlink (fs, dir, my_file);
+ if ((ret && errno != ENOENT) || (ret == 0)) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking non-existant %s: invalid errno ,%d, %s\n",
+ my_file, ret, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink non-existent directory */
+ ret = glfs_h_unlink (fs, dir, my_subdir);
+ if ((ret && errno != ENOENT) || (ret == 0)) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking non-existant %s: invalid errno ,%d, %s\n",
+ my_subdir, ret, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ /* unlink directory */
+ ret = glfs_h_unlink (fs, parent, my_dir);
+ if (ret) {
+ fprintf (stderr, "glfs_h_unlink: error unlinking %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+
+ printf ("glfs_h_unlink tests: PASSED\n");
+
+out:
+ if (dir)
+ glfs_h_close (dir);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (subdir)
+ glfs_h_close (subdir);
+ if (subleaf)
+ glfs_h_close (subleaf);
+ if (parent)
+ glfs_h_close (parent);
+
+ return;
+}
+
+void
+test_h_getsetattrs (void)
+{
+ char *my_dir = "attrdir";
+ char *my_file = "attrfile.txt";
+ struct glfs_object *parent = NULL, *leaf = NULL, *dir = NULL;
+ struct stat sb, retsb;
+ int ret, valid;
+ struct timespec timestamp;
+
+ printf("glfs_h_getattrs and setattrs tests: In Progress\n");
+
+ /* Prepare tests */
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dir = glfs_h_mkdir (fs, parent, my_dir, 0644, &sb);
+ if (dir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, parent, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, dir, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dir, strerror (errno));
+ printf ("glfs_h_unlink tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ ret = glfs_h_getattrs (fs, dir, &retsb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&retsb);
+ /* TODO: Compare stat information */
+
+ retsb.st_mode = 00666;
+ retsb.st_uid = 1000;
+ retsb.st_gid = 1001;
+ ret = clock_gettime (CLOCK_REALTIME, &timestamp);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+ retsb.st_atim = timestamp;
+ retsb.st_mtim = timestamp;
+ valid = GFAPI_SET_ATTR_MODE | GFAPI_SET_ATTR_UID | GFAPI_SET_ATTR_GID |
+ GFAPI_SET_ATTR_ATIME | GFAPI_SET_ATTR_MTIME;
+ peek_stat (&retsb);
+
+ ret = glfs_h_setattrs (fs, dir, &retsb, valid);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_setattrs: error %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ memset(&retsb, 0, sizeof (struct stat));
+ ret = glfs_h_stat (fs, dir, &retsb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_stat: error %s: from (%p),%s\n",
+ my_dir, dir, strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&retsb);
+
+ printf ("glfs_h_getattrs and setattrs tests: PASSED\n");
+out:
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (dir)
+ glfs_h_close (dir);
+
+ return;
+}
+
+void
+test_h_truncate (void)
+{
+ char *my_dir = "truncatedir";
+ char *my_file = "file.txt";
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL;
+ struct stat sb;
+ glfs_fd_t *fd = NULL;
+ char buf[32];
+ off_t offset = 0;
+ int ret = 0;
+
+ printf("glfs_h_truncate tests: In Progress\n");
+
+ /* Prepare tests */
+ root = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_mkdir (fs, root, my_dir, 0644, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, root, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ fd = glfs_h_open (fs, leaf, O_RDWR);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_h_open: error on open of %s: %s\n",
+ my_file, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ memcpy (buf, "abcdefghijklmnopqrstuvwxyz012345", 32);
+ ret = glfs_write (fd, buf, 32, 0);
+
+ /* run tests */
+ /* truncate lower */
+ offset = 30;
+ ret = glfs_h_truncate (fs, leaf, offset);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_truncate: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ ret = glfs_h_getattrs (fs, leaf, &sb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error for %s (%p),%s\n",
+ my_file, leaf, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ if (sb.st_size != offset) {
+ fprintf (stderr, "glfs_h_truncate: post size mismatch\n");
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ /* truncate higher */
+ offset = 32;
+ ret = glfs_h_truncate (fs, leaf, offset);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_truncate: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ ret = glfs_h_getattrs (fs, leaf, &sb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error for %s (%p),%s\n",
+ my_file, leaf, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ if (sb.st_size != offset) {
+ fprintf (stderr, "glfs_h_truncate: post size mismatch\n");
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ /* truncate equal */
+ offset = 30;
+ ret = glfs_h_truncate (fs, leaf, offset);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_truncate: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ ret = glfs_h_getattrs (fs, leaf, &sb);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_getattrs: error for %s (%p),%s\n",
+ my_file, leaf, strerror (errno));
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+ if (sb.st_size != offset) {
+ fprintf (stderr, "glfs_h_truncate: post size mismatch\n");
+ printf ("glfs_h_truncate tests: FAILED\n");
+ goto out;
+ }
+
+ printf ("glfs_h_truncate tests: PASSED\n");
+out:
+ if (fd)
+ glfs_close (fd);
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+
+ return;
+}
+
+void
+test_h_links (void)
+{
+ char *my_dir = "linkdir";
+ char *my_file = "file.txt";
+ char *my_symlnk = "slnk.txt";
+ char *my_lnk = "lnk.txt";
+ char *linksrc_dir = "dir1";
+ char *linktgt_dir = "dir2";
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL,
+ *dirsrc = NULL, *dirtgt = NULL, *dleaf = NULL;
+ struct glfs_object *ln1 = NULL;
+ struct stat sb;
+ int ret;
+ char *buf = NULL;
+
+ printf("glfs_h_link(s) tests: In Progress\n");
+
+ /* Prepare tests */
+ root = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_mkdir (fs, root, my_dir, 0644, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, root, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirsrc = glfs_h_mkdir (fs, parent, linksrc_dir, 0644, &sb);
+ if (dirsrc == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ linksrc_dir, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirtgt = glfs_h_mkdir (fs, parent, linktgt_dir, 0644, &sb);
+ if (dirtgt == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ linktgt_dir, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dleaf = glfs_h_creat (fs, dirsrc, my_file, O_CREAT, 0644, &sb);
+ if (dleaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dirsrc, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* run tests */
+ /* sym link: /testdir/linkdir/file.txt to ./slnk.txt */
+ ln1 = glfs_h_symlink (fs, parent, my_symlnk, "./file.txt", &sb);
+ if (ln1 == NULL) {
+ fprintf (stderr, "glfs_h_symlink: error creating %s: from (%p),%s\n",
+ my_symlnk, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ buf = calloc (1024, sizeof(char));
+ if (buf == NULL) {
+ fprintf (stderr, "Error allocating memory\n");
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+
+ ret = glfs_h_readlink (fs, ln1, buf, 1024);
+ if (ret <= 0) {
+ fprintf (stderr, "glfs_h_readlink: error reading %s: from (%p),%s\n",
+ my_symlnk, ln1, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ if (!(strncmp (buf, my_symlnk, strlen (my_symlnk)))) {
+ fprintf (stderr, "glfs_h_readlink: error mismatch in link name: actual %s: retrieved %s\n",
+ my_symlnk, buf);
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+
+ /* link: /testdir/linkdir/file.txt to ./lnk.txt */
+ ret = glfs_h_link (fs, leaf, parent, my_lnk);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_link: error creating %s: from (%p),%s\n",
+ my_lnk, parent, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ /* TODO: Should write content to a file and read from the link */
+
+ /* link: /testdir/linkdir/dir1/file.txt to ../dir2/slnk.txt */
+ ret = glfs_h_link (fs, dleaf, dirtgt, my_lnk);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_link: error creating %s: from (%p),%s\n",
+ my_lnk, dirtgt, strerror (errno));
+ printf ("glfs_h_link(s) tests: FAILED\n");
+ goto out;
+ }
+ /* TODO: Should write content to a file and read from the link */
+
+ printf ("glfs_h_link(s) tests: PASSED\n");
+
+out:
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (dirsrc)
+ glfs_h_close (dirsrc);
+ if (dirtgt)
+ glfs_h_close (dirtgt);
+ if (dleaf)
+ glfs_h_close (dleaf);
+ if (ln1)
+ glfs_h_close (ln1);
+ if (buf)
+ free (buf);
+
+ return;
+}
+
+void
+test_h_rename (void)
+{
+ char *my_dir = "renamedir";
+ char *my_file = "file.txt";
+ char *src_dir = "dir1";
+ char *tgt_dir = "dir2";
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL,
+ *dirsrc = NULL, *dirtgt = NULL, *dleaf = NULL;
+ struct stat sb;
+ int ret;
+
+ printf("glfs_h_rename tests: In Progress\n");
+
+ /* Prepare tests */
+ root = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_mkdir (fs, root, my_dir, 0644, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, root, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, my_file, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, parent, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirsrc = glfs_h_mkdir (fs, parent, src_dir, 0644, &sb);
+ if (dirsrc == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ src_dir, parent, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dirtgt = glfs_h_mkdir (fs, parent, tgt_dir, 0644, &sb);
+ if (dirtgt == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ tgt_dir, parent, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ dleaf = glfs_h_creat (fs, dirsrc, my_file, O_CREAT, 0644, &sb);
+ if (dleaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file, dirsrc, strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* run tests */
+ /* Rename file.txt -> file1.txt */
+ ret = glfs_h_rename (fs, parent, "file.txt", parent, "file1.txt");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "file.txt", "file1.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir1/file.txt -> file.txt */
+ ret = glfs_h_rename (fs, dirsrc, "file.txt", parent, "file.txt");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s/%s to %s (%s)\n",
+ src_dir, "file.txt", "file.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename file1.txt -> file.txt (exists) */
+ ret = glfs_h_rename (fs, parent, "file1.txt", parent, "file.txt");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "file.txt", "file.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir1 -> dir3 */
+ ret = glfs_h_rename (fs, parent, "dir1", parent, "dir3");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "dir1", "dir3", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir2 ->dir3 (exists) */
+ ret = glfs_h_rename (fs, parent, "dir2", parent, "dir3");
+ if (ret != 0) {
+ fprintf (stderr, "glfs_h_rename: error renaming %s to %s (%s)\n",
+ "dir2", "dir3", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename file.txt -> dir3 (fail) */
+ ret = glfs_h_rename (fs, parent, "file.txt", parent, "dir3");
+ if (ret == 0) {
+ fprintf (stderr, "glfs_h_rename: NO error renaming %s to %s (%s)\n",
+ "file.txt", "dir3", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ /* rename dir3 -> file.txt (fail) */
+ ret = glfs_h_rename (fs, parent, "dir3", parent, "file.txt");
+ if (ret == 0) {
+ fprintf (stderr, "glfs_h_rename: NO error renaming %s to %s (%s)\n",
+ "dir3", "file.txt", strerror (errno));
+ printf ("glfs_h_rename tests: FAILED\n");
+ goto out;
+ }
+
+ printf ("glfs_h_rename tests: PASSED\n");
+
+out:
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+ if (dirsrc)
+ glfs_h_close (dirsrc);
+ if (dirtgt)
+ glfs_h_close (dirtgt);
+ if (dleaf)
+ glfs_h_close (dleaf);
+
+ return;
+}
+
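+/*
+ * Accumulate the interval (ts_ed - ts_st) into *ts, carrying nanosecond
+ * underflow/overflow into the seconds field.
+ */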
+void
+assimilatetime (struct timespec *ts, struct timespec ts_st,
+ struct timespec ts_ed)
+{
+ if ((ts_ed.tv_nsec - ts_st.tv_nsec) < 0) {
+ ts->tv_sec += ts_ed.tv_sec - ts_st.tv_sec - 1;
+ ts->tv_nsec += 1000000000 + ts_ed.tv_nsec - ts_st.tv_nsec;
+ } else {
+ ts->tv_sec += ts_ed.tv_sec - ts_st.tv_sec;
+ ts->tv_nsec += ts_ed.tv_nsec - ts_st.tv_nsec;
+ }
+
+ if (ts->tv_nsec > 1000000000) {
+ ts->tv_nsec = ts->tv_nsec - 1000000000;
+ ts->tv_sec += 1;
+ }
+
+ return;
+}
+
+#define MAX_FILES_CREATE 10
+#define MAXPATHNAME 512
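+/*
+ * Create MAX_FILES_CREATE empty files twice, once through the handle-based
+ * API and once through the path-based API, and print the elapsed time for
+ * each run.
+ */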
+void
+test_h_performance (void)
+{
+ char *my_dir = "perftest",
+ *full_dir_path="/testdir/perftest";
+ char *my_file = "file_", my_file_name[MAXPATHNAME];
+ struct glfs_object *parent = NULL, *leaf = NULL, *dir = NULL;
+ struct stat sb;
+ int ret, i;
+ struct glfs_fd *fd;
+ struct timespec c_ts = {0, 0}, c_ts_st, c_ts_ed;
+ struct timespec o_ts = {0, 0}, o_ts_st, o_ts_ed;
+
+ printf("glfs_h_performance tests: In Progress\n");
+
+ /* Prepare tests */
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, NULL, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ dir = glfs_h_mkdir (fs, parent, my_dir, 0644, &sb);
+ if (dir == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error creating %s: from (%p),%s\n",
+ my_dir, parent, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* create performance */
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ for (i = 0; i < MAX_FILES_CREATE; i++) {
+ sprintf (my_file_name, "%s%d", my_file, i);
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ leaf = glfs_h_lookupat (fs, dir, my_file_name, &sb);
+ if (leaf != NULL) {
+ fprintf (stderr, "glfs_h_lookup: exists %s\n",
+ my_file_name);
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ leaf = glfs_h_creat (fs, dir, my_file_name, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error creating %s: from (%p),%s\n",
+ my_file_name, dir, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&c_ts, c_ts_st, c_ts_ed);
+ glfs_h_close (leaf); leaf = NULL;
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&o_ts, o_ts_st, o_ts_ed);
+
+ printf ("Creation performance (handle based):\n\t# empty files:%d\n",
+ MAX_FILES_CREATE);
+ printf ("\tOverall time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ o_ts.tv_sec, o_ts.tv_nsec);
+ printf ("\tcreate call time time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ c_ts.tv_sec, c_ts.tv_nsec);
+
+ /* create using path */
+ c_ts.tv_sec = o_ts.tv_sec = 0;
+ c_ts.tv_nsec = o_ts.tv_nsec = 0;
+
+ sprintf (my_file_name, "%s1", full_dir_path);
+ ret = glfs_mkdir (fs, my_file_name, 0644);
+ if (ret != 0) {
+ fprintf (stderr, "glfs_mkdir: error creating %s: from (%p),%s\n",
+ my_file_name, parent, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ for (i = 0; i < MAX_FILES_CREATE; i++) {
+ sprintf (my_file_name, "%s1/%sn%d", full_dir_path, my_file, i);
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_st);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ ret = glfs_stat (fs, my_file_name, &sb);
+ if (ret == 0) {
+ fprintf (stderr, "glfs_stat: exists %s\n",
+ my_file_name);
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ fd = glfs_creat (fs, my_file_name, O_CREAT, 0644);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_creat: error creating %s: from (%p),%s\n",
+ my_file_name, dir, strerror (errno));
+ printf ("glfs_h_performance tests: FAILED\n");
+ goto out;
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &c_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n",
+ strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&c_ts, c_ts_st, c_ts_ed);
+ glfs_close (fd);
+ }
+
+ ret = clock_gettime (CLOCK_REALTIME, &o_ts_ed);
+ if(ret != 0) {
+ fprintf (stderr, "clock_gettime: error %s\n", strerror (errno));
+ printf ("glfs_h_getattrs and setattrs tests: FAILED\n");
+ goto out;
+ }
+
+ assimilatetime (&o_ts, o_ts_st, o_ts_ed);
+
+ printf ("Creation performance (path based):\n\t# empty files:%d\n",
+ MAX_FILES_CREATE);
+ printf ("\tOverall time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ o_ts.tv_sec, o_ts.tv_nsec);
+ printf ("\tcreate call time time:\n\t\tSecs:%ld\n\t\tnSecs:%ld\n",
+ c_ts.tv_sec, c_ts.tv_nsec);
+out:
+ return;
+}
+
+int
+test_handleops (int argc, char *argv[])
+{
+ int ret = 0;
+ glfs_fd_t *fd = NULL;
+ struct stat sb = {0, };
+ struct glfs_object *root = NULL, *parent = NULL, *leaf = NULL,
+ *tmp = NULL;
+ char readbuf[32], writebuf[32];
+ unsigned char leaf_handle[GFAPI_HANDLE_LENGTH];
+
+ char *full_leaf_name = "/testdir/testfile.txt",
+ *leaf_name = "testfile.txt",
+ *relative_leaf_name = "testdir/testfile.txt";
+ char *leaf_name1 = "testfile1.txt";
+ char *full_newparent_name = "/testdir/dir1",
+ *newparent_name = "dir1";
+ char *full_newnod_name = "/testdir/nod1",
+ *newnod_name = "nod1";
+
+ /* Initialize test area */
+ ret = glfs_mkdir (fs, full_parent_name, 0644);
+ if (ret != 0 && errno != EEXIST) {
+ fprintf (stderr, "%s: (%p) %s\n", full_parent_name, fd,
+ strerror (errno));
+ printf ("Test initialization failed on volume %s\n", argv[1]);
+ goto out;
+ }
+ else if (ret != 0) {
+ printf ("Found test directory %s to be existing\n",
+ full_parent_name);
+ printf ("Cleanup test directory and restart tests\n");
+ goto out;
+ }
+
+ fd = glfs_creat (fs, full_leaf_name, O_CREAT, 0644);
+ if (fd == NULL) {
+ fprintf (stderr, "%s: (%p) %s\n", full_leaf_name, fd,
+ strerror (errno));
+ printf ("Test initialization failed on volume %s\n", argv[1]);
+ goto out;
+ }
+ glfs_close (fd);
+
+ printf ("Initialized the test area, within volume %s\n", argv[1]);
+
+ /* Handle based APIs test area */
+
+ /* glfs_lookupat test */
+ printf ("glfs_h_lookupat tests: In Progress\n");
+ /* start at root of the volume */
+ root = glfs_h_lookupat (fs, NULL, "/", &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ "/", NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* lookup a parent within root */
+ parent = glfs_h_lookupat (fs, root, parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ parent_name, root, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* lookup a leaf/child within the parent */
+ leaf = glfs_h_lookupat (fs, parent, leaf_name, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ leaf_name, parent, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* reset */
+ glfs_h_close (root); root = NULL;
+ glfs_h_close (leaf); leaf = NULL;
+ glfs_h_close (parent); parent = NULL;
+
+ /* check absolute paths */
+ root = glfs_h_lookupat (fs, NULL, "/", &sb);
+ if (root == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ "/", NULL, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_lookupat (fs, NULL, full_leaf_name, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_leaf_name, parent, strerror (errno));
+ printf ("glfs_h_lookupat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* reset */
+ glfs_h_close (leaf); leaf = NULL;
+
+ /* check multiple component paths */
+ leaf = glfs_h_lookupat (fs, root, relative_leaf_name, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ relative_leaf_name, parent, strerror (errno));
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* reset */
+ glfs_h_close (root); root = NULL;
+ glfs_h_close (parent); parent = NULL;
+
+ /* check symlinks in path */
+
+ /* TODO: -ve test cases */
+ /* parent invalid
+ * path invalid
+ * path does not exist after some components
+ * no parent, but relative path
+ * parent and full path? -ve?
+ */
+
+ printf ("glfs_h_lookupat tests: PASSED\n");
+
+ /* glfs_openat test */
+ printf ("glfs_h_open tests: In Progress\n");
+ fd = glfs_h_open (fs, leaf, O_RDWR);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_h_open: error on open of %s: %s\n",
+ full_leaf_name, strerror (errno));
+ printf ("glfs_h_open tests: FAILED\n");
+ goto out;
+ }
+
+ /* test read/write based on fd */
+ memcpy (writebuf, "abcdefghijklmnopqrstuvwxyz012345", 32);
+ ret = glfs_write (fd, writebuf, 32, 0);
+
+ glfs_lseek (fd, 0, SEEK_SET);
+
+ ret = glfs_read (fd, readbuf, 32, 0);
+ if (memcmp (readbuf, writebuf, 32)) {
+ printf ("Failed to read what I wrote: %s %s\n", readbuf,
+ writebuf);
+ glfs_close (fd);
+ printf ("glfs_h_open tests: FAILED\n");
+ goto out;
+ }
+
+ glfs_h_close (leaf); leaf = NULL;
+ glfs_close (fd);
+
+ printf ("glfs_h_open tests: PASSED\n");
+
+ /* Create tests */
+ printf ("glfs_h_creat tests: In Progress\n");
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_creat (fs, parent, leaf_name1, O_CREAT, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_creat: error on create of %s: from (%p),%s\n",
+ leaf_name1, parent, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_creat (fs, parent, leaf_name1, O_CREAT | O_EXCL, 0644,
+ &sb);
+ if (leaf != NULL || errno != EEXIST) {
+ fprintf (stderr, "glfs_h_creat: existing file, leaf = (%p), errno = %s\n",
+ leaf, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ if (leaf != NULL) {
+ glfs_h_close (leaf); leaf = NULL;
+ }
+ }
+
+ tmp = glfs_h_creat (fs, root, parent_name, O_CREAT, 0644, &sb);
+ if (tmp != NULL || !(errno == EISDIR || errno == EINVAL)) {
+ fprintf (stderr, "glfs_h_creat: dir create, tmp = (%p), errno = %s\n",
+ tmp, strerror (errno));
+ printf ("glfs_h_creat tests: FAILED\n");
+ if (tmp != NULL) {
+ glfs_h_close (tmp); tmp = NULL;
+ }
+ }
+
+ /* TODO: Other combinations and -ve cases as applicable */
+ printf ("glfs_h_creat tests: PASSED\n");
+
+ /* extract handle and create from handle test */
+ printf ("glfs_h_extract_handle and glfs_h_create_from_handle tests: In Progress\n");
+ /* TODO: Change the lookup below to creat to reproduce a GFID recovery
+ * failure that needs to be fixed */
+ leaf = glfs_h_lookupat (fs, parent, leaf_name1, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ leaf_name1, parent, strerror (errno));
+ printf ("glfs_h_extract_handle tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ ret = glfs_h_extract_handle (leaf, leaf_handle,
+ GFAPI_HANDLE_LENGTH);
+ if (ret < 0) {
+ fprintf (stderr, "glfs_h_extract_handle: error extracting handle of %s: %s\n",
+ full_leaf_name, strerror (errno));
+ printf ("glfs_h_extract_handle tests: FAILED\n");
+ goto out;
+ }
+ peek_handle (leaf_handle);
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_create_from_handle (fs, leaf_handle, GFAPI_HANDLE_LENGTH,
+ &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_create_from_handle: error on create of %s: from (%p),%s\n",
+ leaf_name1, leaf_handle, strerror (errno));
+ printf ("glfs_h_create_from_handle tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ fd = glfs_h_open (fs, leaf, O_RDWR);
+ if (fd == NULL) {
+ fprintf (stderr, "glfs_h_open: error on open of %s: %s\n",
+ full_leaf_name, strerror (errno));
+ printf ("glfs_h_create_from_handle tests: FAILED\n");
+ goto out;
+ }
+
+ /* test read/write based on fd */
+ memcpy (writebuf, "abcdefghijklmnopqrstuvwxyz012345", 32);
+ ret = glfs_write (fd, writebuf, 32, 0);
+
+ glfs_lseek (fd, 0, SEEK_SET);
+
+ ret = glfs_read (fd, readbuf, 32, 0);
+ if (memcmp (readbuf, writebuf, 32)) {
+ printf ("Failed to read what I wrote: %s %s\n", writebuf,
+ writebuf);
+ printf ("glfs_h_create_from_handle tests: FAILED\n");
+ glfs_close (fd);
+ goto out;
+ }
+
+ glfs_close (fd);
+ glfs_h_close (leaf); leaf = NULL;
+ glfs_h_close (parent); parent = NULL;
+
+ printf ("glfs_h_extract_handle and glfs_h_create_from_handle tests: PASSED\n");
+
+ /* Mkdir tests */
+ printf ("glfs_h_mkdir tests: In Progress\n");
+
+ ret = glfs_rmdir (fs, full_newparent_name);
+ if (ret && errno != ENOENT) {
+ fprintf (stderr, "glfs_rmdir: Failed for %s: %s\n",
+ full_newparent_name, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ goto out;
+ }
+
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_mkdir (fs, parent, newparent_name, 0644, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error on mkdir of %s: from (%p),%s\n",
+ newparent_name, parent, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_mkdir (fs, parent, newparent_name, 0644, &sb);
+ if (leaf != NULL || errno != EEXIST) {
+ fprintf (stderr, "glfs_h_mkdir: existing directory, leaf = (%p), errno = %s\n",
+ leaf, strerror (errno));
+ printf ("glfs_h_mkdir tests: FAILED\n");
+ if (leaf != NULL) {
+ glfs_h_close (leaf); leaf = NULL;
+ }
+ }
+
+ glfs_h_close (parent); parent = NULL;
+
+ printf ("glfs_h_mkdir tests: PASSED\n");
+
+ /* Mknod tests */
+ printf ("glfs_h_mknod tests: In Progress\n");
+ ret = glfs_unlink (fs, full_newnod_name);
+ if (ret && errno != ENOENT) {
+ fprintf (stderr, "glfs_unlink: Failed for %s: %s\n",
+ full_newnod_name, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ goto out;
+ }
+
+ parent = glfs_h_lookupat (fs, NULL, full_parent_name, &sb);
+ if (parent == NULL) {
+ fprintf (stderr, "glfs_h_lookupat: error on lookup of %s: from (%p),%s\n",
+ full_parent_name, root, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ leaf = glfs_h_mknod (fs, parent, newnod_name, S_IFIFO, 0, &sb);
+ if (leaf == NULL) {
+ fprintf (stderr, "glfs_h_mkdir: error on mkdir of %s: from (%p),%s\n",
+ newnod_name, parent, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ goto out;
+ }
+ peek_stat (&sb);
+
+ /* TODO: creat op on a FIFO node hangs, need to check and fix
+ tmp = glfs_h_creat (fs, parent, newnod_name, O_CREAT, 0644, &sb);
+ if (tmp != NULL || errno != EINVAL) {
+ fprintf (stderr, "glfs_h_creat: node create, tmp = (%p), errno = %s\n",
+ tmp, strerror (errno));
+ printf ("glfs_h_creat/mknod tests: FAILED\n");
+ if (tmp != NULL) {
+ glfs_h_close(tmp); tmp = NULL;
+ }
+ } */
+
+ glfs_h_close (leaf); leaf = NULL;
+
+ leaf = glfs_h_mknod (fs, parent, newnod_name, 0644, 0, &sb);
+ if (leaf != NULL || errno != EEXIST) {
+ fprintf (stderr, "glfs_h_mknod: existing node, leaf = (%p), errno = %s\n",
+ leaf, strerror (errno));
+ printf ("glfs_h_mknod tests: FAILED\n");
+ if (leaf != NULL) {
+ glfs_h_close (leaf); leaf = NULL;
+ }
+ }
+
+ glfs_h_close (parent); parent = NULL;
+
+ printf ("glfs_h_mknod tests: PASSED\n");
+
+ /* unlink tests */
+ test_h_unlink ();
+
+ /* TODO: opendir tests */
+
+ /* getattr tests */
+ test_h_getsetattrs ();
+
+ /* TODO: setattr tests */
+
+ /* truncate tests */
+ test_h_truncate();
+
+ /* link tests */
+ test_h_links ();
+
+ /* rename tests */
+ test_h_rename ();
+
+ /* performance tests */
+ test_h_performance ();
+
+ /* END: New APIs test area */
+
+out:
+ /* Cleanup glfs handles */
+ if (root)
+ glfs_h_close (root);
+ if (parent)
+ glfs_h_close (parent);
+ if (leaf)
+ glfs_h_close (leaf);
+
+ return ret;
+}
+
+int
+main (int argc, char *argv[])
+{
+ glfs_t *fs2 = NULL;
+ int ret = 0;
+ glfs_fd_t *fd = NULL;
+ glfs_fd_t *fd2 = NULL;
+ struct stat sb = {0, };
+ char readbuf[32];
+ char writebuf[32];
+
+ char *filename = "/filename2";
+
+ if (argc != 3) {
+ printf ("Expect following args\n\t%s <volname> <hostname>\n", argv[0]);
+ return -1;
+ }
+
+ fs = glfs_new (argv[1]);
+ if (!fs) {
+ fprintf (stderr, "glfs_new: returned NULL\n");
+ return 1;
+ }
+
+// ret = glfs_set_volfile (fs, "/tmp/posix.vol");
+
+ ret = glfs_set_volfile_server (fs, "tcp", argv[2], 24007);
+
+// ret = glfs_set_volfile_server (fs, "unix", "/tmp/gluster.sock", 0);
+
+ ret = glfs_set_logging (fs, "/dev/stderr", 7);
+
+ ret = glfs_init (fs);
+
+ fprintf (stderr, "glfs_init: returned %d\n", ret);
+
+ sleep (2);
+
+ fs2 = glfs_new (argv[1]);
+ if (!fs2) {
+ fprintf (stderr, "glfs_new: returned NULL\n");
+ return 1;
+ }
+
+
+// ret = glfs_set_volfile (fs2, "/tmp/posix.vol");
+
+ ret = glfs_set_volfile_server (fs2, "tcp", argv[2], 24007);
+
+ ret = glfs_set_logging (fs2, "/dev/stderr", 7);
+
+ ret = glfs_init (fs2);
+
+ fprintf (stderr, "glfs_init: returned %d\n", ret);
+
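+ /* Basic cross-client check: create and write the file through fs, then
+ * open and read it back through the second connection fs2. */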
+ ret = glfs_lstat (fs, filename, &sb);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
+
+ fd = glfs_creat (fs, filename, O_RDWR, 0644);
+ fprintf (stderr, "%s: (%p) %s\n", filename, fd, strerror (errno));
+
+ fd2 = glfs_open (fs2, filename, O_RDWR);
+ fprintf (stderr, "%s: (%p) %s\n", filename, fd, strerror (errno));
+
+ sprintf (writebuf, "hi there\n");
+ ret = glfs_write (fd, writebuf, 32, 0);
+
+ glfs_lseek (fd2, 0, SEEK_SET);
+
+ ret = glfs_read (fd2, readbuf, 32, 0);
+
+ printf ("read %d, %s", ret, readbuf);
+
+ glfs_close (fd);
+ glfs_close (fd2);
+
+ filename = "/filename3";
+ ret = glfs_mknod (fs, filename, S_IFIFO, 0);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
+
+ ret = glfs_lstat (fs, filename, &sb);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
+
+
+ ret = glfs_rename (fs, filename, "/filename4");
+ fprintf (stderr, "rename(%s): (%d) %s\n", filename, ret,
+ strerror (errno));
+
+ ret = glfs_unlink (fs, "/filename4");
+ fprintf (stderr, "unlink(%s): (%d) %s\n", "/filename4", ret,
+ strerror (errno));
+
+ filename = "/dirname2";
+ ret = glfs_mkdir (fs, filename, 0);
+ fprintf (stderr, "%s: (%d) %s\n", filename, ret, strerror (errno));
+
+ ret = glfs_lstat (fs, filename, &sb);
+ fprintf (stderr, "lstat(%s): (%d) %s\n", filename, ret, strerror (errno));
+
+ ret = glfs_rmdir (fs, filename);
+ fprintf (stderr, "rmdir(%s): (%d) %s\n", filename, ret, strerror (errno));
+
+ test_dirops (fs);
+
+ test_xattr (fs);
+
+ test_chdir (fs);
+
+ test_handleops (argc, argv);
+ // done
+
+ glfs_fini (fs);
+ glfs_fini (fs2);
+
+ return ret;
+}
diff --git a/api/examples/setup.py.in b/api/examples/setup.py.in
new file mode 100644
index 000000000..44b738094
--- /dev/null
+++ b/api/examples/setup.py.in
@@ -0,0 +1,29 @@
+from distutils.core import setup
+
+# generate a __init__.py for the package namespace
+fo = open('__init__.py', 'w')
+fo.write('__version__ = "@PACKAGE_VERSION@"\n')
+fo.close()
+
+DESC = """GlusterFS is a clustered file-system capable of scaling to
+several petabytes. It aggregates various storage bricks over Infiniband
+RDMA or TCP/IP interconnect into one large parallel network file system.
+GlusterFS is one of the most sophisticated file systems in terms of
+features and extensibility. It borrows a powerful concept called
+Translators from the GNU Hurd kernel. Much of the code in GlusterFS is in
+user space and easily manageable.
+
+This package contains the Python interface to the libgfapi library."""
+
+setup(
+ name='glusterfs-api',
+ version='@PACKAGE_VERSION@',
+ description='Python client library for the GlusterFS libgfapi',
+ long_description=DESC,
+ author='Gluster Community',
+ author_email='gluster-devel@nongnu.org',
+ license='LGPLv3',
+ url='http://gluster.org/',
+ package_dir={'gluster':''},
+ packages=['gluster']
+)
diff --git a/api/src/Makefile.am b/api/src/Makefile.am
new file mode 100644
index 000000000..7c5df3e20
--- /dev/null
+++ b/api/src/Makefile.am
@@ -0,0 +1,36 @@
+lib_LTLIBRARIES = libgfapi.la
+noinst_HEADERS = glfs-mem-types.h glfs-internal.h
+libgfapi_HEADERS = glfs.h glfs-handles.h
+libgfapidir = $(includedir)/glusterfs/api
+
+libgfapi_la_SOURCES = glfs.c glfs-mgmt.c glfs-fops.c glfs-resolve.c \
+ glfs-handleops.c
+libgfapi_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+ $(top_builddir)/rpc/xdr/src/libgfxdr.la \
+ $(GF_LDADD)
+
+libgfapi_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 \
+ -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/rpc/xdr/src
+
+
+xlator_LTLIBRARIES = api.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mount
+# workaround for broken parallel install support in automake with LTLIBRARIES
+# http://debbugs.gnu.org/cgi/bugreport.cgi?bug=7328
+install_xlatorLTLIBRARIES = install-xlatorLTLIBRARIES
+$(install_xlatorLTLIBRARIES): install-libLTLIBRARIES
+
+api_la_SOURCES = glfs-master.c
+api_la_DEPENDENCIES = libgfapi.la
+api_la_LDFLAGS = -module -avoid-version
+api_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+ $(top_builddir)/rpc/xdr/src/libgfxdr.la \
+ $(top_builddir)/api/src/libgfapi.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c
new file mode 100644
index 000000000..10bb7d38b
--- /dev/null
+++ b/api/src/glfs-fops.c
@@ -0,0 +1,3252 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#include "glfs-internal.h"
+#include "glfs-mem-types.h"
+#include "syncop.h"
+#include "glfs.h"
+#include <limits.h>
+
+#ifdef NAME_MAX
+#define GF_NAME_MAX NAME_MAX
+#else
+#define GF_NAME_MAX 255
+#endif
+
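+/* Size of a buffer large enough for a struct dirent plus the longest
+   possible entry name and its terminating NUL. */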
+#define READDIRBUF_SIZE (sizeof(struct dirent) + GF_NAME_MAX + 1)
+
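+/*
+ * Link the inode in @loc into the inode table against its parent and name,
+ * taking a lookup reference on it; returns 0 on success, -1 with errno set
+ * on failure.
+ */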
+int
+glfs_loc_link (loc_t *loc, struct iatt *iatt)
+{
+ int ret = -1;
+ inode_t *linked_inode = NULL;
+
+ if (!loc->inode) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ linked_inode = inode_link (loc->inode, loc->parent, loc->name, iatt);
+ if (linked_inode) {
+ inode_lookup (linked_inode);
+ inode_unref (linked_inode);
+ ret = 0;
+ } else {
+ ret = -1;
+ errno = ENOMEM;
+ }
+
+ return ret;
+}
+
+
+void
+glfs_iatt_to_stat (struct glfs *fs, struct iatt *iatt, struct stat *stat)
+{
+ iatt_to_stat (iatt, stat);
+ stat->st_dev = fs->dev_id;
+}
+
+
+int
+glfs_loc_unlink (loc_t *loc)
+{
+ inode_unlink (loc->inode, loc->parent, loc->name);
+
+ return 0;
+}
+
+
+struct glfs_fd *
+glfs_open (struct glfs *fs, const char *path, int flags)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd)
+ goto out;
+
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (IA_ISDIR (iatt.ia_type)) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ if (!IA_ISREG (iatt.ia_type)) {
+ ret = -1;
+ errno = EINVAL;
+ goto out;
+ }
+
+ if (glfd->fd) {
+ /* Retry. It is safe to touch glfd->fd as we
+ have not called glfs_fd_bind() yet.
+ */
+ fd_unref (glfd->fd);
+ glfd->fd = NULL;
+ }
+
+ glfd->fd = fd_create (loc.inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_open (subvol, &loc, flags, glfd->fd);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+out:
+ loc_wipe (&loc);
+
+ if (ret && glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ } else if (glfd) {
+ glfd->fd->flags = flags;
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return glfd;
+}
+
+
+int
+glfs_close (struct glfs_fd *glfd)
+{
+ xlator_t *subvol = NULL;
+ int ret = -1;
+ fd_t *fd = NULL;
+ struct glfs *fs = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_flush (subvol, fd);
+out:
+ fs = glfd->fs;
+ glfs_fd_destroy (glfd);
+
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_lstat (struct glfs *fs, const char *path, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0 && stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_stat (struct glfs *fs, const char *path, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0 && stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_fstat (struct glfs_fd *glfd, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ struct iatt iatt = {0, };
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fstat (subvol, fd, &iatt);
+
+ if (ret == 0 && stat)
+ glfs_iatt_to_stat (glfd->fs, &iatt, stat);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+struct glfs_fd *
+glfs_creat (struct glfs *fs, const char *path, int flags, mode_t mode)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd)
+ goto out;
+
+ /* This must be glfs_resolve() and NOT glfs_lresolve().
+ That is because open("name", O_CREAT) where "name"
+ is a dangling symlink must create the dangling
+ destination.
+ */
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == -1 && errno != ENOENT)
+ /* Any other type of error is fatal */
+ goto out;
+
+ if (ret == -1 && errno == ENOENT && !loc.parent)
+ /* The parent directory or an ancestor even
+ higher does not exist
+ */
+ goto out;
+
+ if (loc.inode) {
+ if (flags & O_EXCL) {
+ ret = -1;
+ errno = EEXIST;
+ goto out;
+ }
+
+ if (IA_ISDIR (iatt.ia_type)) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ if (!IA_ISREG (iatt.ia_type)) {
+ ret = -1;
+ errno = EINVAL;
+ goto out;
+ }
+ }
+
+ if (ret == -1 && errno == ENOENT) {
+ loc.inode = inode_new (loc.parent->table);
+ if (!loc.inode) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+ }
+
+ if (glfd->fd) {
+ /* Retry. It is safe to touch glfd->fd as we
+ have not called glfs_fd_bind() yet.
+ */
+ fd_unref (glfd->fd);
+ glfd->fd = NULL;
+ }
+
+ glfd->fd = fd_create (loc.inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ if (ret == 0) {
+ ret = syncop_open (subvol, &loc, flags, glfd->fd);
+ } else {
+ ret = syncop_create (subvol, &loc, flags, mode, glfd->fd,
+ xattr_req, &iatt);
+ }
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
+out:
+ loc_wipe (&loc);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ if (ret && glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ } else if (glfd) {
+ glfd->fd->flags = flags;
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return glfd;
+}
+
+
+off_t
+glfs_lseek (struct glfs_fd *glfd, off_t offset, int whence)
+{
+ struct stat sb = {0, };
+ int ret = -1;
+
+ __glfs_entry_fd (glfd);
+
+ switch (whence) {
+ case SEEK_SET:
+ glfd->offset = offset;
+ break;
+ case SEEK_CUR:
+ glfd->offset += offset;
+ break;
+ case SEEK_END:
+ ret = glfs_fstat (glfd, &sb);
+ if (ret) {
+ /* lseek() has no way to report this error; leave the offset unchanged */
+ break;
+ }
+ glfd->offset = sb.st_size + offset;
+ break;
+ }
+
+ return glfd->offset;
+}
+
+
+//////////////
+
+ssize_t
+glfs_preadv (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
+ off_t offset, int flags)
+{
+ xlator_t *subvol = NULL;
+ ssize_t ret = -1;
+ ssize_t size = -1;
+ struct iovec *iov = NULL;
+ int cnt = 0;
+ struct iobref *iobref = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ size = iov_length (iovec, iovcnt);
+
+ ret = syncop_readv (subvol, fd, size, offset, 0, &iov, &cnt, &iobref);
+ if (ret <= 0)
+ goto out;
+
+ size = iov_copy (iovec, iovcnt, iov, cnt); /* FIXME!!! */
+
+ glfd->offset = (offset + size);
+
+ ret = size;
+out:
+ if (iov)
+ GF_FREE (iov);
+ if (iobref)
+ iobref_unref (iobref);
+
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+ssize_t
+glfs_read (struct glfs_fd *glfd, void *buf, size_t count, int flags)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = buf;
+ iov.iov_len = count;
+
+ ret = glfs_preadv (glfd, &iov, 1, glfd->offset, flags);
+
+ return ret;
+}
+
+
+ssize_t
+glfs_pread (struct glfs_fd *glfd, void *buf, size_t count, off_t offset,
+ int flags)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = buf;
+ iov.iov_len = count;
+
+ ret = glfs_preadv (glfd, &iov, 1, offset, flags);
+
+ return ret;
+}
+
+
+ssize_t
+glfs_readv (struct glfs_fd *glfd, const struct iovec *iov, int count,
+ int flags)
+{
+ ssize_t ret = 0;
+
+ ret = glfs_preadv (glfd, iov, count, glfd->offset, flags);
+
+ return ret;
+}
+
+
+struct glfs_io {
+ struct glfs_fd *glfd;
+ int op;
+ off_t offset;
+ struct iovec *iov;
+ int count;
+ int flags;
+ glfs_io_cbk fn;
+ void *data;
+};
+
+
+static int
+glfs_io_async_cbk (int ret, call_frame_t *frame, void *data)
+{
+ struct glfs_io *gio = data;
+
+ gio->fn (gio->glfd, ret, gio->data);
+
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+
+ return 0;
+}
+
+
+static int
+glfs_io_async_task (void *data)
+{
+ struct glfs_io *gio = data;
+ ssize_t ret = 0;
+
+ switch (gio->op) {
+ case GF_FOP_WRITE:
+ ret = glfs_pwritev (gio->glfd, gio->iov, gio->count,
+ gio->offset, gio->flags);
+ break;
+ case GF_FOP_FTRUNCATE:
+ ret = glfs_ftruncate (gio->glfd, gio->offset);
+ break;
+ case GF_FOP_FSYNC:
+ if (gio->flags)
+ ret = glfs_fdatasync (gio->glfd);
+ else
+ ret = glfs_fsync (gio->glfd);
+ break;
+ case GF_FOP_DISCARD:
+ ret = glfs_discard (gio->glfd, gio->offset, gio->count);
+ break;
+ case GF_FOP_ZEROFILL:
+ ret = glfs_zerofill(gio->glfd, gio->offset, gio->count);
+ break;
+ }
+
+ return (int) ret;
+}
+
+
+int
+glfs_preadv_async_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iovec *iovec,
+ int count, struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ struct glfs_io *gio = NULL;
+ xlator_t *subvol = NULL;
+ struct glfs *fs = NULL;
+ struct glfs_fd *glfd = NULL;
+
+
+ gio = frame->local;
+ frame->local = NULL;
+ subvol = cookie;
+ glfd = gio->glfd;
+ fs = glfd->fs;
+
+ if (op_ret <= 0)
+ goto out;
+
+ op_ret = iov_copy (gio->iov, gio->count, iovec, count);
+
+ glfd->offset = gio->offset + op_ret;
+out:
+ errno = op_errno;
+ gio->fn (gio->glfd, op_ret, gio->data);
+
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ STACK_DESTROY (frame->root);
+ glfs_subvol_done (fs, subvol);
+
+ return 0;
+}
+
+
+int
+glfs_preadv_async (struct glfs_fd *glfd, const struct iovec *iovec, int count,
+ off_t offset, int flags, glfs_io_cbk fn, void *data)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ xlator_t *subvol = NULL;
+ glfs_t *fs = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ fs = glfd->fs;
+
+ frame = syncop_create_frame (THIS);
+ if (!frame) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ gio->iov = iov_dup (iovec, count);
+ if (!gio->iov) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ gio->op = GF_FOP_READ;
+ gio->glfd = glfd;
+ gio->count = count;
+ gio->offset = offset;
+ gio->flags = flags;
+ gio->fn = fn;
+ gio->data = data;
+
+ frame->local = gio;
+
+ STACK_WIND_COOKIE (frame, glfs_preadv_async_cbk, subvol, subvol,
+ subvol->fops->readv, fd, iov_length (iovec, count),
+ offset, flags, NULL);
+
+out:
+ if (ret) {
+ /* bail-out path: gio, its iov and frame may not all have been
+ allocated yet, so guard each cleanup */
+ if (gio) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+ if (frame)
+ STACK_DESTROY (frame->root);
+ glfs_subvol_done (glfd->fs, subvol);
+ }
+
+ if (fd)
+ fd_unref (fd);
+
+ return ret;
+}
+
+
+int
+glfs_read_async (struct glfs_fd *glfd, void *buf, size_t count, int flags,
+ glfs_io_cbk fn, void *data)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = buf;
+ iov.iov_len = count;
+
+ ret = glfs_preadv_async (glfd, &iov, 1, glfd->offset, flags, fn, data);
+
+ return ret;
+}
+
+
+int
+glfs_pread_async (struct glfs_fd *glfd, void *buf, size_t count, off_t offset,
+ int flags, glfs_io_cbk fn, void *data)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = buf;
+ iov.iov_len = count;
+
+ ret = glfs_preadv_async (glfd, &iov, 1, offset, flags, fn, data);
+
+ return ret;
+}
+
+
+int
+glfs_readv_async (struct glfs_fd *glfd, const struct iovec *iov, int count,
+ int flags, glfs_io_cbk fn, void *data)
+{
+ ssize_t ret = 0;
+
+ ret = glfs_preadv_async (glfd, iov, count, glfd->offset, flags,
+ fn, data);
+ return ret;
+}
+
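The asynchronous read calls above complete through the glfs_io_cbk callback; as the invocation gio->fn (gio->glfd, ret, gio->data) shows, the callback receives the glfs_fd, the byte count (negative on error), and the caller-supplied cookie. A minimal caller sketch, assuming that callback signature from glfs.h; the helper names and the completion flag are purely illustrative:

/* Sketch, not part of the patch: submit one asynchronous pread and note
   completion through a flag. Assumes the glfs_io_cbk typedef in glfs.h
   matches the (glfs_fd, ssize_t, void *) invocation seen above. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include "glfs.h"

static void
read_done (struct glfs_fd *fd, ssize_t ret, void *cookie)
{
        int *done = cookie;

        if (ret < 0)
                fprintf (stderr, "async read failed: %s\n", strerror (errno));
        *done = 1;        /* a real caller would signal a condition variable */
}

static void
issue_read (struct glfs_fd *glfd)
{
        static char buf[4096];
        static int  done = 0;

        if (glfs_pread_async (glfd, buf, sizeof (buf), 0, 0,
                              read_done, &done) < 0)
                fprintf (stderr, "submit failed: %s\n", strerror (errno));

        /* ... wait until read_done() sets 'done' before reusing buf ... */
}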
+///// writev /////
+
+ssize_t
+glfs_pwritev (struct glfs_fd *glfd, const struct iovec *iovec, int iovcnt,
+ off_t offset, int flags)
+{
+ xlator_t *subvol = NULL;
+ int ret = -1;
+ size_t size = -1;
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iovec iov = {0, };
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ size = iov_length (iovec, iovcnt);
+
+ iobuf = iobuf_get2 (subvol->ctx->iobuf_pool, size);
+ if (!iobuf) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ iobref = iobref_new ();
+ if (!iobref) {
+ iobuf_unref (iobuf);
+ errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = iobref_add (iobref, iobuf);
+ if (ret) {
+ iobuf_unref (iobuf);
+ iobref_unref (iobref);
+ errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ iov_unload (iobuf_ptr (iobuf), iovec, iovcnt); /* FIXME!!! */
+
+ iov.iov_base = iobuf_ptr (iobuf);
+ iov.iov_len = size;
+
+ ret = syncop_writev (subvol, fd, &iov, 1, offset, iobref, flags);
+
+ iobuf_unref (iobuf);
+ iobref_unref (iobref);
+
+ if (ret <= 0)
+ goto out;
+
+ glfd->offset = (offset + size);
+
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+ssize_t
+glfs_write (struct glfs_fd *glfd, const void *buf, size_t count, int flags)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = (void *) buf;
+ iov.iov_len = count;
+
+ ret = glfs_pwritev (glfd, &iov, 1, glfd->offset, flags);
+
+ return ret;
+}
+
+
+
+ssize_t
+glfs_writev (struct glfs_fd *glfd, const struct iovec *iov, int count,
+ int flags)
+{
+ ssize_t ret = 0;
+
+ ret = glfs_pwritev (glfd, iov, count, glfd->offset, flags);
+
+ return ret;
+}
+
+
+ssize_t
+glfs_pwrite (struct glfs_fd *glfd, const void *buf, size_t count, off_t offset,
+ int flags)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = (void *) buf;
+ iov.iov_len = count;
+
+ ret = glfs_pwritev (glfd, &iov, 1, offset, flags);
+
+ return ret;
+}
+
+
+int
+glfs_pwritev_async (struct glfs_fd *glfd, const struct iovec *iovec, int count,
+ off_t offset, int flags, glfs_io_cbk fn, void *data)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->iov = iov_dup (iovec, count);
+ if (!gio->iov) {
+ GF_FREE (gio);
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->op = GF_FOP_WRITE;
+ gio->glfd = glfd;
+ gio->count = count;
+ gio->offset = offset;
+ gio->flags = flags;
+ gio->fn = fn;
+ gio->data = data;
+
+ ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
+ glfs_io_async_task, glfs_io_async_cbk,
+ NULL, gio);
+
+ if (ret) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+
+ return ret;
+}
+
+
+int
+glfs_write_async (struct glfs_fd *glfd, const void *buf, size_t count, int flags,
+ glfs_io_cbk fn, void *data)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = (void *) buf;
+ iov.iov_len = count;
+
+ ret = glfs_pwritev_async (glfd, &iov, 1, glfd->offset, flags, fn, data);
+
+ return ret;
+}
+
+
+int
+glfs_pwrite_async (struct glfs_fd *glfd, const void *buf, int count,
+ off_t offset, int flags, glfs_io_cbk fn, void *data)
+{
+ struct iovec iov = {0, };
+ ssize_t ret = 0;
+
+ iov.iov_base = (void *) buf;
+ iov.iov_len = count;
+
+ ret = glfs_pwritev_async (glfd, &iov, 1, offset, flags, fn, data);
+
+ return ret;
+}
+
+
+int
+glfs_writev_async (struct glfs_fd *glfd, const struct iovec *iov, int count,
+ int flags, glfs_io_cbk fn, void *data)
+{
+ ssize_t ret = 0;
+
+ ret = glfs_pwritev_async (glfd, iov, count, glfd->offset, flags,
+ fn, data);
+ return ret;
+}
+
+
+int
+glfs_fsync (struct glfs_fd *glfd)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fsync (subvol, fd, 0);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+static int
+glfs_fsync_async_common (struct glfs_fd *glfd, glfs_io_cbk fn, void *data,
+ int dataonly)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->op = GF_FOP_FSYNC;
+ gio->glfd = glfd;
+ gio->flags = dataonly;
+ gio->fn = fn;
+ gio->data = data;
+
+ ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
+ glfs_io_async_task, glfs_io_async_cbk,
+ NULL, gio);
+
+ if (ret) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+
+ return ret;
+
+}
+
+
+int
+glfs_fsync_async (struct glfs_fd *glfd, glfs_io_cbk fn, void *data)
+{
+ return glfs_fsync_async_common (glfd, fn, data, 0);
+}
+
+
+int
+glfs_fdatasync (struct glfs_fd *glfd)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fsync (subvol, fd, 1);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_fdatasync_async (struct glfs_fd *glfd, glfs_io_cbk fn, void *data)
+{
+ return glfs_fsync_async_common (glfd, fn, data, 1);
+}
+
+
+int
+glfs_ftruncate (struct glfs_fd *glfd, off_t offset)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_ftruncate (subvol, fd, offset);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_ftruncate_async (struct glfs_fd *glfd, off_t offset,
+ glfs_io_cbk fn, void *data)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->op = GF_FOP_FTRUNCATE;
+ gio->glfd = glfd;
+ gio->offset = offset;
+ gio->fn = fn;
+ gio->data = data;
+
+ ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
+ glfs_io_async_task, glfs_io_async_cbk,
+ NULL, gio);
+
+ if (ret) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+
+ return ret;
+}
+
+
+int
+glfs_access (struct glfs *fs, const char *path, int mode)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = syncop_access (subvol, &loc, mode);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_symlink (struct glfs *fs, const char *data, const char *path)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (loc.inode) {
+ errno = EEXIST;
+ ret = -1;
+ goto out;
+ }
+
+ if (ret == -1 && errno != ENOENT)
+ /* Any other type of error is fatal */
+ goto out;
+
+ if (ret == -1 && errno == ENOENT && !loc.parent)
+ /* The parent directory or an ancestor even
+ higher does not exist
+ */
+ goto out;
+
+ /* ret == -1 && errno == ENOENT */
+ loc.inode = inode_new (loc.parent->table);
+ if (!loc.inode) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_symlink (subvol, &loc, data, xattr_req, &iatt);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
+out:
+ loc_wipe (&loc);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_readlink (struct glfs *fs, const char *path, char *buf, size_t bufsiz)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+ char *linkval = NULL;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (iatt.ia_type != IA_IFLNK) {
+ ret = -1;
+ errno = EINVAL;
+ goto out;
+ }
+
+ ret = syncop_readlink (subvol, &loc, &linkval, bufsiz);
+ if (ret > 0) {
+ memcpy (buf, linkval, ret);
+ GF_FREE (linkval);
+ }
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_mknod (struct glfs *fs, const char *path, mode_t mode, dev_t dev)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (loc.inode) {
+ errno = EEXIST;
+ ret = -1;
+ goto out;
+ }
+
+ if (ret == -1 && errno != ENOENT)
+ /* Any other type of error is fatal */
+ goto out;
+
+ if (ret == -1 && errno == ENOENT && !loc.parent)
+ /* The parent directory or an ancestor even
+ higher does not exist
+ */
+ goto out;
+
+ /* ret == -1 && errno == ENOENT */
+ loc.inode = inode_new (loc.parent->table);
+ if (!loc.inode) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_mknod (subvol, &loc, mode, dev, xattr_req, &iatt);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
+out:
+ loc_wipe (&loc);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_mkdir (struct glfs *fs, const char *path, mode_t mode)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (loc.inode) {
+ errno = EEXIST;
+ ret = -1;
+ goto out;
+ }
+
+ if (ret == -1 && errno != ENOENT)
+ /* Any other type of error is fatal */
+ goto out;
+
+ if (ret == -1 && errno == ENOENT && !loc.parent)
+ /* The parent directory or an ancestor even
+ higher does not exist
+ */
+ goto out;
+
+ /* ret == -1 && errno == ENOENT */
+ loc.inode = inode_new (loc.parent->table);
+ if (!loc.inode) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_mkdir (subvol, &loc, mode, xattr_req, &iatt);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_link (&loc, &iatt);
+out:
+ loc_wipe (&loc);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_unlink (struct glfs *fs, const char *path)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (iatt.ia_type == IA_IFDIR) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ ret = syncop_unlink (subvol, &loc);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_rmdir (struct glfs *fs, const char *path)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (iatt.ia_type != IA_IFDIR) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ ret = syncop_rmdir (subvol, &loc);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_rename (struct glfs *fs, const char *oldpath, const char *newpath)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t oldloc = {0, };
+ loc_t newloc = {0, };
+ struct iatt oldiatt = {0, };
+ struct iatt newiatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, oldpath, &oldloc, &oldiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &oldloc, retry);
+
+ if (ret)
+ goto out;
+retrynew:
+ ret = glfs_lresolve (fs, subvol, newpath, &newloc, &newiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &newloc, retrynew);
+
+ if (ret && errno != ENOENT && newloc.parent)
+ goto out;
+
+ if (newiatt.ia_type != IA_INVAL) {
+ if ((oldiatt.ia_type == IA_IFDIR) !=
+ (newiatt.ia_type == IA_IFDIR)) {
+ /* Either both old and new must be dirs,
+ * or both must be non-dirs. Else, fail.
+ */
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+ }
+
+ /* TODO: check if new or old is a prefix of the other, and fail EINVAL */
+
+ ret = syncop_rename (subvol, &oldloc, &newloc);
+
+ if (ret == -1 && errno == ESTALE) {
+ if (reval < DEFAULT_REVAL_COUNT) {
+ reval++;
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+ goto retry;
+ }
+ }
+
+ if (ret == 0)
+ inode_rename (oldloc.parent->table, oldloc.parent, oldloc.name,
+ newloc.parent, newloc.name, oldloc.inode,
+ &oldiatt);
+out:
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_link (struct glfs *fs, const char *oldpath, const char *newpath)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t oldloc = {0, };
+ loc_t newloc = {0, };
+ struct iatt oldiatt = {0, };
+ struct iatt newiatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_lresolve (fs, subvol, oldpath, &oldloc, &oldiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &oldloc, retry);
+
+ if (ret)
+ goto out;
+retrynew:
+ ret = glfs_lresolve (fs, subvol, newpath, &newloc, &newiatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &newloc, retrynew);
+
+ if (ret == 0) {
+ ret = -1;
+ errno = EEXIST;
+ goto out;
+ }
+
+ if (oldiatt.ia_type == IA_IFDIR) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ /* Fill in the inode of the hard link with that of the
+ original file
+ */
+ if (newloc.inode) {
+ inode_unref (newloc.inode);
+ newloc.inode = NULL;
+ }
+ newloc.inode = inode_ref (oldloc.inode);
+
+ ret = syncop_link (subvol, &oldloc, &newloc);
+
+ if (ret == -1 && errno == ESTALE) {
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+ if (reval--)
+ goto retry;
+ }
+
+ if (ret == 0)
+ ret = glfs_loc_link (&newloc, &oldiatt);
+out:
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+struct glfs_fd *
+glfs_opendir (struct glfs *fs, const char *path)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd)
+ goto out;
+
+ INIT_LIST_HEAD (&glfd->entries);
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (!IA_ISDIR (iatt.ia_type)) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ if (glfd->fd) {
+ /* Retry. It is safe to touch glfd->fd as we
+ have not called glfs_fd_bind() yet.
+ */
+ fd_unref (glfd->fd);
+ glfd->fd = NULL;
+ }
+
+ glfd->fd = fd_create (loc.inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_opendir (subvol, &loc, glfd->fd);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+out:
+ loc_wipe (&loc);
+
+ if (ret && glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ } else if (glfd) {
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return glfd;
+}
+
+
+int
+glfs_closedir (struct glfs_fd *glfd)
+{
+ __glfs_entry_fd (glfd);
+
+ gf_dirent_free (list_entry (&glfd->entries, gf_dirent_t, list));
+
+ glfs_fd_destroy (glfd);
+
+ return 0;
+}
+
+
+long
+glfs_telldir (struct glfs_fd *fd)
+{
+ return fd->offset;
+}
+
+
+void
+glfs_seekdir (struct glfs_fd *fd, long offset)
+{
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+
+ if (fd->offset == offset)
+ return;
+
+ fd->offset = offset;
+ fd->next = NULL;
+
+ list_for_each_entry_safe (entry, tmp, &fd->entries, list) {
+ if (entry->d_off != offset)
+ continue;
+
+ if (&tmp->list != &fd->entries) {
+ /* found! */
+ fd->next = tmp;
+ return;
+ }
+ }
+ /* could not find entry at requested offset in the cache.
+ next readdir_r() will result in glfd_entry_refresh()
+ */
+}
+
+int
+glfs_discard_async (struct glfs_fd *glfd, off_t offset, size_t len,
+ glfs_io_cbk fn, void *data)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->op = GF_FOP_DISCARD;
+ gio->glfd = glfd;
+ gio->offset = offset;
+ gio->count = len;
+ gio->fn = fn;
+ gio->data = data;
+
+ ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
+ glfs_io_async_task, glfs_io_async_cbk,
+ NULL, gio);
+
+ if (ret) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+
+ return ret;
+}
+
+int
+glfs_zerofill_async (struct glfs_fd *glfd, off_t offset, size_t len,
+ glfs_io_cbk fn, void *data)
+{
+ struct glfs_io *gio = NULL;
+ int ret = 0;
+
+ gio = GF_CALLOC (1, sizeof (*gio), glfs_mt_glfs_io_t);
+ if (!gio) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ gio->op = GF_FOP_ZEROFILL;
+ gio->glfd = glfd;
+ gio->offset = offset;
+ gio->count = len;
+ gio->fn = fn;
+ gio->data = data;
+
+ ret = synctask_new (glfs_from_glfd (glfd)->ctx->env,
+ glfs_io_async_task, glfs_io_async_cbk,
+ NULL, gio);
+
+ if (ret) {
+ GF_FREE (gio->iov);
+ GF_FREE (gio);
+ }
+
+ return ret;
+}
+
+
+void
+gf_dirent_to_dirent (gf_dirent_t *gf_dirent, struct dirent *dirent)
+{
+ dirent->d_ino = gf_dirent->d_ino;
+
+#ifdef _DIRENT_HAVE_D_OFF
+ dirent->d_off = gf_dirent->d_off;
+#endif
+
+#ifdef _DIRENT_HAVE_D_TYPE
+ dirent->d_type = gf_dirent->d_type;
+#endif
+
+#ifdef _DIRENT_HAVE_D_NAMLEN
+ dirent->d_namlen = strlen (gf_dirent->d_name);
+#endif
+
+ strncpy (dirent->d_name, gf_dirent->d_name, GF_NAME_MAX + 1);
+}
+
+
+int
+glfd_entry_refresh (struct glfs_fd *glfd, int plus)
+{
+ xlator_t *subvol = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t old;
+ int ret = -1;
+ fd_t *fd = NULL;
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ if (fd->inode->ia_type != IA_IFDIR) {
+ ret = -1;
+ errno = EBADF;
+ goto out;
+ }
+
+ INIT_LIST_HEAD (&entries.list);
+ INIT_LIST_HEAD (&old.list);
+
+ if (plus)
+ ret = syncop_readdirp (subvol, fd, 131072, glfd->offset,
+ NULL, &entries);
+ else
+ ret = syncop_readdir (subvol, fd, 131072, glfd->offset,
+ &entries);
+ if (ret >= 0) {
+ if (plus)
+ gf_link_inodes_from_dirent (THIS, fd->inode, &entries);
+
+ list_splice_init (&glfd->entries, &old.list);
+ list_splice_init (&entries.list, &glfd->entries);
+
+ /* spurious errno is dangerous for glfd_entry_next() */
+ errno = 0;
+ }
+
+ if (ret > 0)
+ glfd->next = list_entry (glfd->entries.next, gf_dirent_t, list);
+
+ gf_dirent_free (&old);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+gf_dirent_t *
+glfd_entry_next (struct glfs_fd *glfd, int plus)
+{
+ gf_dirent_t *entry = NULL;
+ int ret = -1;
+
+ if (!glfd->offset || !glfd->next) {
+ ret = glfd_entry_refresh (glfd, plus);
+ if (ret < 0)
+ return NULL;
+ }
+
+ entry = glfd->next;
+ if (!entry)
+ return NULL;
+
+ if (&entry->next->list == &glfd->entries)
+ glfd->next = NULL;
+ else
+ glfd->next = entry->next;
+
+ glfd->offset = entry->d_off;
+
+ return entry;
+}
+
+
+static struct dirent *
+glfs_readdirbuf_get (struct glfs_fd *glfd)
+{
+ struct dirent *buf = NULL;
+
+ LOCK (&glfd->fd->lock);
+ {
+ buf = glfd->readdirbuf;
+ if (buf) {
+ memset (buf, 0, READDIRBUF_SIZE);
+ goto unlock;
+ }
+
+ buf = GF_CALLOC (1, READDIRBUF_SIZE, glfs_mt_readdirbuf_t);
+ if (!buf) {
+ errno = ENOMEM;
+ goto unlock;
+ }
+
+ glfd->readdirbuf = buf;
+ }
+unlock:
+ UNLOCK (&glfd->fd->lock);
+
+ return buf;
+}
+
+
+int
+glfs_readdirplus_r (struct glfs_fd *glfd, struct stat *stat, struct dirent *ext,
+ struct dirent **res)
+{
+ int ret = 0;
+ gf_dirent_t *entry = NULL;
+ struct dirent *buf = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ errno = 0;
+
+ if (ext)
+ buf = ext;
+ else
+ buf = glfs_readdirbuf_get (glfd);
+
+ if (!buf) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ entry = glfd_entry_next (glfd, !!stat);
+ if (errno)
+ ret = -1;
+
+ if (res) {
+ if (entry)
+ *res = buf;
+ else
+ *res = NULL;
+ }
+
+ if (entry) {
+ gf_dirent_to_dirent (entry, buf);
+ if (stat)
+ glfs_iatt_to_stat (glfd->fs, &entry->d_stat, stat);
+ }
+
+ return ret;
+}
+
+
+int
+glfs_readdir_r (struct glfs_fd *glfd, struct dirent *buf, struct dirent **res)
+{
+ return glfs_readdirplus_r (glfd, NULL, buf, res);
+}
+
+
+struct dirent *
+glfs_readdirplus (struct glfs_fd *glfd, struct stat *stat)
+{
+ struct dirent *res = NULL;
+ int ret = -1;
+
+ ret = glfs_readdirplus_r (glfd, stat, NULL, &res);
+ if (ret)
+ return NULL;
+
+ return res;
+}
+
+
+
+struct dirent *
+glfs_readdir (struct glfs_fd *glfd)
+{
+ return glfs_readdirplus (glfd, NULL);
+}
+
+
+int
+glfs_statvfs (struct glfs *fs, const char *path, struct statvfs *buf)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = syncop_statfs (subvol, &loc, buf);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_setattr (struct glfs *fs, const char *path, struct iatt *iatt,
+ int valid, int follow)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt riatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ if (follow)
+ ret = glfs_resolve (fs, subvol, path, &loc, &riatt, reval);
+ else
+ ret = glfs_lresolve (fs, subvol, path, &loc, &riatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = syncop_setattr (subvol, &loc, iatt, valid, 0, 0);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_fsetattr (struct glfs_fd *glfd, struct iatt *iatt, int valid)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fsetattr (subvol, fd, iatt, valid, 0, 0);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_chmod (struct glfs *fs, const char *path, mode_t mode)
+{
+ int ret = -1;
+ struct iatt iatt = {0, };
+ int valid = 0;
+
+ iatt.ia_prot = ia_prot_from_st_mode (mode);
+ valid = GF_SET_ATTR_MODE;
+
+ ret = glfs_setattr (fs, path, &iatt, valid, 1);
+
+ return ret;
+}
+
+
+int
+glfs_fchmod (struct glfs_fd *glfd, mode_t mode)
+{
+ int ret = -1;
+ struct iatt iatt = {0, };
+ int valid = 0;
+
+ iatt.ia_prot = ia_prot_from_st_mode (mode);
+ valid = GF_SET_ATTR_MODE;
+
+ ret = glfs_fsetattr (glfd, &iatt, valid);
+
+ return ret;
+}
+
+
+int
+glfs_chown (struct glfs *fs, const char *path, uid_t uid, gid_t gid)
+{
+ int ret = -1;
+ int valid = 0;
+ struct iatt iatt = {0, };
+
+ iatt.ia_uid = uid;
+ iatt.ia_gid = gid;
+ valid = GF_SET_ATTR_UID|GF_SET_ATTR_GID;
+
+ ret = glfs_setattr (fs, path, &iatt, valid, 1);
+
+ return ret;
+}
+
+
+int
+glfs_lchown (struct glfs *fs, const char *path, uid_t uid, gid_t gid)
+{
+ int ret = -1;
+ int valid = 0;
+ struct iatt iatt = {0, };
+
+ iatt.ia_uid = uid;
+ iatt.ia_gid = gid;
+ valid = GF_SET_ATTR_UID|GF_SET_ATTR_GID;
+
+ ret = glfs_setattr (fs, path, &iatt, valid, 0);
+
+ return ret;
+}
+
+
+int
+glfs_fchown (struct glfs_fd *glfd, uid_t uid, gid_t gid)
+{
+ int ret = -1;
+ int valid = 0;
+ struct iatt iatt = {0, };
+
+ iatt.ia_uid = uid;
+ iatt.ia_gid = gid;
+ valid = GF_SET_ATTR_UID|GF_SET_ATTR_GID;
+
+ ret = glfs_fsetattr (glfd, &iatt, valid);
+
+ return ret;
+}
+
+
+int
+glfs_utimens (struct glfs *fs, const char *path, struct timespec times[2])
+{
+ int ret = -1;
+ int valid = 0;
+ struct iatt iatt = {0, };
+
+ iatt.ia_atime = times[0].tv_sec;
+ iatt.ia_atime_nsec = times[0].tv_nsec;
+ iatt.ia_mtime = times[1].tv_sec;
+ iatt.ia_mtime_nsec = times[1].tv_nsec;
+
+ valid = GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME;
+
+ ret = glfs_setattr (fs, path, &iatt, valid, 1);
+
+ return ret;
+}
+
+
+int
+glfs_lutimens (struct glfs *fs, const char *path, struct timespec times[2])
+{
+ int ret = -1;
+ int valid = 0;
+ struct iatt iatt = {0, };
+
+ iatt.ia_atime = times[0].tv_sec;
+ iatt.ia_atime_nsec = times[0].tv_nsec;
+ iatt.ia_mtime = times[1].tv_sec;
+ iatt.ia_mtime_nsec = times[1].tv_nsec;
+
+ valid = GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME;
+
+ ret = glfs_setattr (fs, path, &iatt, valid, 0);
+
+ return ret;
+}
+
+
+int
+glfs_futimens (struct glfs_fd *glfd, struct timespec times[2])
+{
+ int ret = -1;
+ int valid = 0;
+ struct iatt iatt = {0, };
+
+ iatt.ia_atime = times[0].tv_sec;
+ iatt.ia_atime_nsec = times[0].tv_nsec;
+ iatt.ia_mtime = times[1].tv_sec;
+ iatt.ia_mtime_nsec = times[1].tv_nsec;
+
+ valid = GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME;
+
+ ret = glfs_fsetattr (glfd, &iatt, valid);
+
+ return ret;
+}
+
+
+int
+glfs_getxattr_process (void *value, size_t size, dict_t *xattr,
+ const char *name)
+{
+ data_t *data = NULL;
+ int ret = -1;
+
+ data = dict_get (xattr, (char *)name);
+ if (!data) {
+ errno = ENODATA;
+ ret = -1;
+ goto out;
+ }
+
+ ret = data->len;
+ if (!value || !size)
+ goto out;
+
+ if (size < ret) {
+ ret = -1;
+ errno = ERANGE;
+ goto out;
+ }
+
+ memcpy (value, data->data, ret);
+out:
+ if (xattr)
+ dict_unref (xattr);
+ return ret;
+}
+
+
+ssize_t
+glfs_getxattr_common (struct glfs *fs, const char *path, const char *name,
+ void *value, size_t size, int follow)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ dict_t *xattr = NULL;
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ if (follow)
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+ else
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = syncop_getxattr (subvol, &loc, &xattr, name);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = glfs_getxattr_process (value, size, xattr, name);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+ssize_t
+glfs_getxattr (struct glfs *fs, const char *path, const char *name,
+ void *value, size_t size)
+{
+ return glfs_getxattr_common (fs, path, name, value, size, 1);
+}
+
+
+ssize_t
+glfs_lgetxattr (struct glfs *fs, const char *path, const char *name,
+ void *value, size_t size)
+{
+ return glfs_getxattr_common (fs, path, name, value, size, 0);
+}
+
+
+ssize_t
+glfs_fgetxattr (struct glfs_fd *glfd, const char *name, void *value,
+ size_t size)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ dict_t *xattr = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fgetxattr (subvol, fd, &xattr, name);
+ if (ret)
+ goto out;
+
+ ret = glfs_getxattr_process (value, size, xattr, name);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_listxattr_process (void *value, size_t size, dict_t *xattr)
+{
+ int ret = -1;
+
+ ret = dict_keys_join (NULL, 0, xattr, NULL);
+
+ if (!value || !size)
+ goto out;
+
+ if (size < ret) {
+ ret = -1;
+ errno = ERANGE;
+ goto out;
+ }
+
+ dict_keys_join (value, size, xattr, NULL);
+out:
+ if (xattr)
+ dict_unref (xattr);
+ return ret;
+}
+
+
+ssize_t
+glfs_listxattr_common (struct glfs *fs, const char *path, void *value,
+ size_t size, int follow)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ dict_t *xattr = NULL;
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+retry:
+ if (follow)
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+ else
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = syncop_getxattr (subvol, &loc, &xattr, NULL);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = glfs_listxattr_process (value, size, xattr);
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+ssize_t
+glfs_listxattr (struct glfs *fs, const char *path, void *value, size_t size)
+{
+ return glfs_listxattr_common (fs, path, value, size, 1);
+}
+
+
+ssize_t
+glfs_llistxattr (struct glfs *fs, const char *path, void *value, size_t size)
+{
+ return glfs_listxattr_common (fs, path, value, size, 0);
+}
+
+
+ssize_t
+glfs_flistxattr (struct glfs_fd *glfd, void *value, size_t size)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ dict_t *xattr = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fgetxattr (subvol, fd, &xattr, NULL);
+ if (ret)
+ goto out;
+
+ ret = glfs_listxattr_process (value, size, xattr);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+dict_t *
+dict_for_key_value (const char *name, const char *value, size_t size)
+{
+ dict_t *xattr = NULL;
+ int ret = 0;
+
+ xattr = dict_new ();
+ if (!xattr)
+ return NULL;
+
+ ret = dict_set_static_bin (xattr, (char *)name, (void *)value, size);
+ if (ret) {
+ dict_destroy (xattr);
+ xattr = NULL;
+ }
+
+ return xattr;
+}
+
+
+int
+glfs_setxattr_common (struct glfs *fs, const char *path, const char *name,
+ const void *value, size_t size, int flags, int follow)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ dict_t *xattr = NULL;
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ if (follow)
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+ else
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ xattr = dict_for_key_value (name, value, size);
+ if (!xattr) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_setxattr (subvol, &loc, xattr, flags);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+out:
+ loc_wipe (&loc);
+ if (xattr)
+ dict_unref (xattr);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_setxattr (struct glfs *fs, const char *path, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return glfs_setxattr_common (fs, path, name, value, size, flags, 1);
+}
+
+
+int
+glfs_lsetxattr (struct glfs *fs, const char *path, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return glfs_setxattr_common (fs, path, name, value, size, flags, 0);
+}
+
+
+int
+glfs_fsetxattr (struct glfs_fd *glfd, const char *name, const void *value,
+ size_t size, int flags)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ dict_t *xattr = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ xattr = dict_for_key_value (name, value, size);
+ if (!xattr) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_fsetxattr (subvol, fd, xattr, flags);
+out:
+ if (xattr)
+ dict_unref (xattr);
+
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_removexattr_common (struct glfs *fs, const char *path, const char *name,
+ int follow)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ if (follow)
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+ else
+ ret = glfs_lresolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ ret = syncop_removexattr (subvol, &loc, name);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_removexattr (struct glfs *fs, const char *path, const char *name)
+{
+ return glfs_removexattr_common (fs, path, name, 1);
+}
+
+
+int
+glfs_lremovexattr (struct glfs *fs, const char *path, const char *name)
+{
+ return glfs_removexattr_common (fs, path, name, 0);
+}
+
+
+int
+glfs_fremovexattr (struct glfs_fd *glfd, const char *name)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fremovexattr (subvol, fd, name);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_fallocate (struct glfs_fd *glfd, int keep_size, off_t offset, size_t len)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_fallocate (subvol, fd, keep_size, offset, len);
+out:
+ if (fd)
+ fd_unref(fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_discard (struct glfs_fd *glfd, off_t offset, size_t len)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_discard (subvol, fd, offset, len);
+out:
+ if (fd)
+ fd_unref(fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_zerofill (struct glfs_fd *glfd, off_t offset, size_t len)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ errno = EBADFD;
+ goto out;
+ }
+
+ ret = syncop_zerofill (subvol, fd, offset, len);
+out:
+ if (fd)
+ fd_unref(fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_chdir (struct glfs *fs, const char *path)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (!IA_ISDIR (iatt.ia_type)) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ glfs_cwd_set (fs, loc.inode);
+
+out:
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+
+int
+glfs_fchdir (struct glfs_fd *glfd)
+{
+ int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ inode = fd->inode;
+
+ if (!IA_ISDIR (inode->ia_type)) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ glfs_cwd_set (glfd->fs, inode);
+ ret = 0;
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+char *
+glfs_realpath (struct glfs *fs, const char *path, char *resolved_path)
+{
+ int ret = -1;
+ char *retpath = NULL;
+ char *allocpath = NULL;
+ xlator_t *subvol = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int reval = 0;
+
+ __glfs_entry_fs (fs);
+
+ if (resolved_path)
+ retpath = resolved_path;
+ else
+ retpath = allocpath = malloc (PATH_MAX + 1);
+
+ if (!retpath) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+
+ if (ret)
+ goto out;
+
+ if (loc.path) {
+ strncpy (retpath, loc.path, PATH_MAX);
+ retpath[PATH_MAX] = 0;
+ }
+
+out:
+ loc_wipe (&loc);
+
+ if (ret == -1) {
+ if (allocpath)
+ free (allocpath);
+ retpath = NULL;
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return retpath;
+}
+
+
+char *
+glfs_getcwd (struct glfs *fs, char *buf, size_t n)
+{
+ int ret = -1;
+ inode_t *inode = NULL;
+ char *path = NULL;
+
+ __glfs_entry_fs (fs);
+
+ if (!buf || n < 2) {
+ ret = -1;
+ errno = EINVAL;
+ goto out;
+ }
+
+ inode = glfs_cwd_get (fs);
+
+ if (!inode) {
+ strncpy (buf, "/", n);
+ ret = 0;
+ goto out;
+ }
+
+ ret = inode_path (inode, 0, &path);
+ if (n <= ret) {
+ ret = -1;
+ errno = ERANGE;
+ goto out;
+ }
+
+ strncpy (buf, path, n);
+ ret = 0;
+out:
+ GF_FREE (path);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (ret < 0)
+ return NULL;
+
+ return buf;
+}
+
+
+static void
+gf_flock_to_flock (struct gf_flock *gf_flock, struct flock *flock)
+{
+ flock->l_type = gf_flock->l_type;
+ flock->l_whence = gf_flock->l_whence;
+ flock->l_start = gf_flock->l_start;
+ flock->l_len = gf_flock->l_len;
+ flock->l_pid = gf_flock->l_pid;
+}
+
+
+static void
+gf_flock_from_flock (struct gf_flock *gf_flock, struct flock *flock)
+{
+ gf_flock->l_type = flock->l_type;
+ gf_flock->l_whence = flock->l_whence;
+ gf_flock->l_start = flock->l_start;
+ gf_flock->l_len = flock->l_len;
+ gf_flock->l_pid = flock->l_pid;
+}
+
+
+int
+glfs_posix_lock (struct glfs_fd *glfd, int cmd, struct flock *flock)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ struct gf_flock gf_flock = {0, };
+ struct gf_flock saved_flock = {0, };
+ fd_t *fd = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ subvol = glfs_active_subvol (glfd->fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (glfd->fs, subvol, glfd);
+ if (!fd) {
+ ret = -1;
+ errno = EBADFD;
+ goto out;
+ }
+
+ gf_flock_from_flock (&gf_flock, flock);
+ gf_flock_from_flock (&saved_flock, flock);
+ ret = syncop_lk (subvol, fd, cmd, &gf_flock);
+ gf_flock_to_flock (&gf_flock, flock);
+
+ if (ret == 0 && (cmd == F_SETLK || cmd == F_SETLKW))
+ fd_lk_insert_and_merge (fd, cmd, &saved_flock);
+out:
+ if (fd)
+ fd_unref (fd);
+
+ glfs_subvol_done (glfd->fs, subvol);
+
+ return ret;
+}
+
+
+struct glfs_fd *
+glfs_dup (struct glfs_fd *glfd)
+{
+ xlator_t *subvol = NULL;
+ fd_t *fd = NULL;
+ glfs_fd_t *dupfd = NULL;
+ struct glfs *fs = NULL;
+
+ __glfs_entry_fd (glfd);
+
+ fs = glfd->fs;
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ fd = glfs_resolve_fd (fs, subvol, glfd);
+ if (!fd) {
+ errno = EBADFD;
+ goto out;
+ }
+
+ dupfd = glfs_fd_new (fs);
+ if (!dupfd) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ dupfd->fd = fd_ref (fd);
+out:
+ if (fd)
+ fd_unref (fd);
+ if (dupfd)
+ glfs_fd_bind (dupfd);
+
+ glfs_subvol_done (fs, subvol);
+
+ return dupfd;
+}
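glfs_dup() closes out the POSIX-style half of the new API: open/creat, read/write (sync and async), metadata, xattrs, locking and directory iteration. A rough end-to-end sketch of how an application might drive it, assuming the lifecycle calls glfs_new(), glfs_set_volfile_server(), glfs_init() and glfs_fini() that the rest of this patch declares in glfs.h; the volume name "testvol" and host "server1" are placeholders:

/* Sketch only: error handling trimmed, names are placeholders. */
#include <stdio.h>
#include <fcntl.h>
#include "glfs.h"

int
main (void)
{
        struct glfs    *fs = NULL;
        struct glfs_fd *fd = NULL;
        char            buf[128] = {0, };

        fs = glfs_new ("testvol");
        if (!fs)
                return 1;
        glfs_set_volfile_server (fs, "tcp", "server1", 24007);
        if (glfs_init (fs) != 0)
                return 1;

        fd = glfs_creat (fs, "/hello.txt", O_RDWR, 0644);
        if (fd) {
                glfs_write (fd, "hello gfapi\n", 12, 0);
                glfs_lseek (fd, 0, SEEK_SET);
                glfs_read (fd, buf, sizeof (buf) - 1, 0);
                printf ("%s", buf);
                glfs_close (fd);
        }

        glfs_fini (fs);
        return 0;
}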
diff --git a/api/src/glfs-handleops.c b/api/src/glfs-handleops.c
new file mode 100644
index 000000000..9c707a619
--- /dev/null
+++ b/api/src/glfs-handleops.c
@@ -0,0 +1,1278 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+
+#include "glfs-internal.h"
+#include "glfs-mem-types.h"
+#include "syncop.h"
+#include "glfs.h"
+#include "glfs-handles.h"
+
+static void
+glfs_iatt_from_stat (struct stat *stat, int valid, struct iatt *iatt,
+ int *glvalid)
+{
+ /* validate in args */
+ if ((stat == NULL) || (iatt == NULL) || (glvalid == NULL)) {
+ errno = EINVAL;
+ return;
+ }
+
+ *glvalid = 0;
+
+ if (valid & GFAPI_SET_ATTR_MODE) {
+ iatt->ia_prot = ia_prot_from_st_mode (stat->st_mode);
+ *glvalid |= GF_SET_ATTR_MODE;
+ }
+
+ if (valid & GFAPI_SET_ATTR_UID) {
+ iatt->ia_uid = stat->st_uid;
+ *glvalid |= GF_SET_ATTR_UID;
+ }
+
+ if (valid & GFAPI_SET_ATTR_GID) {
+ iatt->ia_gid = stat->st_gid;
+ *glvalid |= GF_SET_ATTR_GID;
+ }
+
+ if (valid & GFAPI_SET_ATTR_ATIME) {
+ iatt->ia_atime = stat->st_atime;
+ iatt->ia_atime_nsec = ST_ATIM_NSEC (stat);
+ *glvalid |= GF_SET_ATTR_ATIME;
+ }
+
+ if (valid & GFAPI_SET_ATTR_MTIME) {
+ iatt->ia_mtime = stat->st_mtime;
+ iatt->ia_mtime_nsec = ST_MTIM_NSEC (stat);
+ *glvalid |= GF_SET_ATTR_MTIME;
+ }
+
+ return;
+}
+
+struct glfs_object *
+glfs_h_lookupat (struct glfs *fs, struct glfs_object *parent,
+ const char *path, struct stat *stat)
+{
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {0, };
+ struct glfs_object *object = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ if (parent) {
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+ }
+
+ /* fop/op */
+ ret = glfs_resolve_at (fs, subvol, inode, path, &loc, &iatt,
+ 0 /*TODO: links? */, 0);
+
+ /* populate out args */
+ if (!ret) {
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_stat (struct glfs *fs, struct glfs_object *object, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* populate loc */
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_stat (subvol, &loc, &iatt);
+
+ /* populate out args */
+ if (!ret && stat) {
+ glfs_iatt_to_stat (fs, &iatt, stat);
+ }
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_getattrs (struct glfs *fs, struct glfs_object *object, struct stat *stat)
+{
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* fop/op */
+ ret = glfs_resolve_base (fs, subvol, inode, &iatt);
+
+ /* populate out args */
+ if (!ret && stat) {
+ glfs_iatt_to_stat (fs, &iatt, stat);
+ }
+
+out:
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_setattrs (struct glfs *fs, struct glfs_object *object, struct stat *stat,
+ int valid)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ int glvalid = 0;
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL) || (stat == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* map valid masks from in args */
+ glfs_iatt_from_stat (stat, valid, &iatt, &glvalid);
+
+ /* populate loc */
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_setattr (subvol, &loc, &iatt, glvalid, 0, 0);
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+struct glfs_fd *
+glfs_h_open (struct glfs *fs, struct glfs_object *object, int flags)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* check types to open */
+ if (IA_ISDIR (inode->ia_type)) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ if (!IA_ISREG (inode->ia_type)) {
+ ret = -1;
+ errno = EINVAL;
+ goto out;
+ }
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ glfd->fd = fd_create (inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ /* populate loc */
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_open (subvol, &loc, flags, glfd->fd);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (ret && glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ } else if (glfd) {
+ glfd->fd->flags = flags;
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return glfd;
+}
+
+struct glfs_object *
+glfs_h_creat (struct glfs *fs, struct glfs_object *parent, const char *path,
+ int flags, mode_t mode, struct stat *stat)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, path);
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd)
+ goto out;
+
+ glfd->fd = fd_create (loc.inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ /* fop/op */
+ ret = syncop_create (subvol, &loc, flags, mode, glfd->fd,
+ xattr_req, &iatt);
+
+ /* populate out args */
+ if (ret == 0) {
+ /* TODO: If the inode existed in the cache (say file already
+ exists), then the glfs_loc_link will not update the
+ loc.inode, as a result we will have a 0000 GFID that we
+ would copy out to the object, this needs to be fixed.
+ */
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ if (glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+struct glfs_object *
+glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, path);
+
+ /* fop/op */
+ ret = syncop_mkdir (subvol, &loc, mode, xattr_req, &iatt);
+
+ /* populate out args */
+ if (ret == 0) {
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+struct glfs_object *
+glfs_h_mknod (struct glfs *fs, struct glfs_object *parent, const char *path,
+ mode_t mode, dev_t dev, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, path);
+
+ /* fop/op */
+ ret = syncop_mknod (subvol, &loc, mode, dev, xattr_req, &iatt);
+
+ /* populate out args */
+ if (ret == 0) {
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_unlink (struct glfs *fs, struct glfs_object *parent, const char *path)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (path == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ ret = glfs_resolve_at (fs, subvol, inode, path, &loc, NULL, 0, 0);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (!IA_ISDIR(loc.inode->ia_type)) {
+ ret = syncop_unlink (subvol, &loc);
+ if (ret != 0) {
+ goto out;
+ }
+ } else {
+ ret = syncop_rmdir (subvol, &loc);
+ if (ret != 0) {
+ goto out;
+ }
+ }
+
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+struct glfs_fd *
+glfs_h_opendir (struct glfs *fs, struct glfs_object *object)
+{
+ int ret = -1;
+ struct glfs_fd *glfd = NULL;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ if (!IA_ISDIR (inode->ia_type)) {
+ ret = -1;
+ errno = ENOTDIR;
+ goto out;
+ }
+
+ glfd = glfs_fd_new (fs);
+ if (!glfd)
+ goto out;
+
+ INIT_LIST_HEAD (&glfd->entries);
+
+ glfd->fd = fd_create (inode, getpid());
+ if (!glfd->fd) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_opendir (subvol, &loc, glfd->fd);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (ret && glfd) {
+ glfs_fd_destroy (glfd);
+ glfd = NULL;
+ } else if (glfd) {
+ fd_bind (glfd->fd);
+ glfs_fd_bind (glfd);
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ return glfd;
+}
+
+ssize_t
+glfs_h_extract_handle (struct glfs_object *object, unsigned char *handle,
+ int len)
+{
+ ssize_t ret = -1;
+
+ /* validate in args */
+ if (object == NULL) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ if (!handle || !len) {
+ ret = GFAPI_HANDLE_LENGTH;
+ goto out;
+ }
+
+ if (len < GFAPI_HANDLE_LENGTH)
+ {
+ errno = ERANGE;
+ goto out;
+ }
+
+ memcpy (handle, object->gfid, GFAPI_HANDLE_LENGTH);
+
+ ret = GFAPI_HANDLE_LENGTH;
+
+out:
+ return ret;
+}
+
+struct glfs_object *
+glfs_h_create_from_handle (struct glfs *fs, unsigned char *handle, int len,
+ struct stat *stat)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ struct iatt iatt = {0, };
+ inode_t *newinode = NULL;
+ xlator_t *subvol = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (handle == NULL) || (len != GFAPI_HANDLE_LENGTH)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ errno = EIO;
+ goto out;
+ }
+
+ memcpy (loc.gfid, handle, GFAPI_HANDLE_LENGTH);
+
+ newinode = inode_find (subvol->itable, loc.gfid);
+ if (newinode)
+ loc.inode = newinode;
+ else {
+ loc.inode = inode_new (subvol->itable);
+ if (!loc.inode) {
+ errno = ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = syncop_lookup (subvol, &loc, 0, &iatt, 0, 0);
+ if (ret) {
+ gf_log (subvol->name, GF_LOG_WARNING,
+ "inode refresh of %s failed: %s",
+ uuid_utoa (loc.gfid), strerror (errno));
+ goto out;
+ }
+
+ newinode = inode_link (loc.inode, 0, 0, &iatt);
+ if (newinode)
+ inode_lookup (newinode);
+ else {
+ gf_log (subvol->name, GF_LOG_WARNING,
+ "inode linking of %s failed: %s",
+ uuid_utoa (loc.gfid), strerror (errno));
+ errno = EINVAL;
+ goto out;
+ }
+
+ /* populate stat */
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ object = GF_CALLOC (1, sizeof(struct glfs_object),
+ glfs_mt_glfs_object_t);
+ if (object == NULL) {
+ errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ /* populate the return object */
+ object->inode = newinode;
+ uuid_copy (object->gfid, object->inode->gfid);
+
+out:
+ /* TODO: Check where the inode ref is being held? */
+ loc_wipe (&loc);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_close (struct glfs_object *object)
+{
+ /* Release the held reference */
+ inode_unref (object->inode);
+ GF_FREE (object);
+
+ return 0;
+}
+
+int
+glfs_h_truncate (struct glfs *fs, struct glfs_object *object, off_t offset)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_truncate (subvol, &loc, (off_t)offset);
+
+ /* populate out args */
+ if (ret == 0)
+ ret = glfs_loc_unlink (&loc);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+struct glfs_object *
+glfs_h_symlink (struct glfs *fs, struct glfs_object *parent, const char *name,
+ const char *data, struct stat *stat)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+ struct glfs_object *object = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (parent == NULL) || (name == NULL) ||
+ (data == NULL)) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, parent);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_PINODE (inode, loc, ret, errno, out, name);
+
+ /* fop/op */
+ ret = syncop_symlink (subvol, &loc, data, xattr_req, &iatt);
+
+ /* populate out args */
+ if (ret == 0) {
+ /* TODO: If the inode existed in the cache (say file already
+ * exists), then the glfs_loc_link will not update the
+ * loc.inode, as a result we will have a 0000 GFID that we
+ * would copy out to the object, this needs to be fixed.
+ */
+ ret = glfs_loc_link (&loc, &iatt);
+ if (ret != 0) {
+ goto out;
+ }
+
+ if (stat)
+ glfs_iatt_to_stat (fs, &iatt, stat);
+
+ ret = glfs_create_object (&loc, &object);
+ }
+
+out:
+ if (ret && object != NULL) {
+ glfs_h_close (object);
+ object = NULL;
+ }
+
+ loc_wipe(&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ glfs_subvol_done (fs, subvol);
+
+ return object;
+}
+
+int
+glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf,
+ size_t bufsiz)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ char *linkval = NULL;
+
+ /* validate in args */
+ if ((fs == NULL) || (object == NULL) || (buf == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, object);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, loc, out);
+
+ /* fop/op */
+ ret = syncop_readlink (subvol, &loc, &linkval, bufsiz);
+
+ /* populate out args */
+ if (ret > 0)
+ memcpy (buf, linkval, ret);
+
+out:
+ loc_wipe (&loc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (linkval)
+ GF_FREE (linkval);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_link (struct glfs *fs, struct glfs_object *linksrc,
+ struct glfs_object *parent, const char *name)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ inode_t *pinode = NULL;
+ loc_t oldloc = {0, };
+ loc_t newloc = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (linksrc == NULL) || (parent == NULL) ||
+ (name == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ inode = glfs_resolve_inode (fs, subvol, linksrc);
+ if (!inode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ if (inode->ia_type == IA_IFDIR) {
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+
+ GLFS_LOC_FILL_INODE (inode, oldloc, out);
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ pinode = glfs_resolve_inode (fs, subvol, parent);
+ if (!pinode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ /* setup newloc based on parent */
+ newloc.parent = inode_ref (pinode);
+ newloc.name = name;
+ ret = glfs_loc_touchup (&newloc);
+ if (ret != 0) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ /* Filling the inode of the hard link to be same as that of the
+ * original file
+ */
+ newloc.inode = inode_ref (inode);
+
+ /* fop/op */
+ ret = syncop_link (subvol, &oldloc, &newloc);
+
+ if (ret == 0)
+ /* TODO: No iatt to pass as there has been no lookup */
+ ret = glfs_loc_link (&newloc, NULL);
+out:
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+
+ if (inode)
+ inode_unref (inode);
+
+ if (pinode)
+ inode_unref (pinode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
+
+int
+glfs_h_rename (struct glfs *fs, struct glfs_object *olddir, const char *oldname,
+ struct glfs_object *newdir, const char *newname)
+{
+ int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *oldpinode = NULL;
+ inode_t *newpinode = NULL;
+ loc_t oldloc = {0, };
+ loc_t newloc = {0, };
+ struct iatt oldiatt = {0, };
+ struct iatt newiatt = {0, };
+
+ /* validate in args */
+ if ((fs == NULL) || (olddir == NULL) || (oldname == NULL) ||
+ (newdir == NULL) || (newname == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ __glfs_entry_fs (fs);
+
+ /* get the active volume */
+ subvol = glfs_active_subvol (fs);
+ if (!subvol) {
+ ret = -1;
+ errno = EIO;
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ oldpinode = glfs_resolve_inode (fs, subvol, olddir);
+ if (!oldpinode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ ret = glfs_resolve_at (fs, subvol, oldpinode, oldname, &oldloc,
+ &oldiatt, 0, 0);
+ if (ret != 0) {
+ goto out;
+ }
+
+ /* get/refresh the in-arg object's inode in correlation to the xlator */
+ newpinode = glfs_resolve_inode (fs, subvol, newdir);
+ if (!newpinode) {
+ errno = ESTALE;
+ goto out;
+ }
+
+ ret = glfs_resolve_at (fs, subvol, newpinode, newname, &newloc,
+ &newiatt, 0, 0);
+
+ if (ret && errno != ENOENT && newloc.parent)
+ goto out;
+
+ if (newiatt.ia_type != IA_INVAL) {
+ if ((oldiatt.ia_type == IA_IFDIR) !=
+ (newiatt.ia_type == IA_IFDIR)) {
+ /* Either both old and new must be dirs,
+ * or both must be non-dirs. Else, fail.
+ */
+ ret = -1;
+ errno = EISDIR;
+ goto out;
+ }
+ }
+
+ /* TODO: check if new or old is a prefix of the other, and fail EINVAL */
+
+ ret = syncop_rename (subvol, &oldloc, &newloc);
+
+ if (ret == 0)
+ inode_rename (oldloc.parent->table, oldloc.parent, oldloc.name,
+ newloc.parent, newloc.name, oldloc.inode,
+ &oldiatt);
+
+out:
+ loc_wipe (&oldloc);
+ loc_wipe (&newloc);
+
+ if (oldpinode)
+ inode_unref (oldpinode);
+
+ if (newpinode)
+ inode_unref (newpinode);
+
+ glfs_subvol_done (fs, subvol);
+
+ return ret;
+}
diff --git a/api/src/glfs-handles.h b/api/src/glfs-handles.h
new file mode 100644
index 000000000..437f2cbc8
--- /dev/null
+++ b/api/src/glfs-handles.h
@@ -0,0 +1,143 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLFS_HANDLES_H
+#define _GLFS_HANDLES_H
+
+#include "glfs.h"
+
+/* GLFS OBJECT BASED OPERATIONS
+ *
+ * The following APIs are introduced to provide an API framework that can work
+ * with gluster objects (files and directories), instead of absolute paths.
+ *
+ * The following API set can be related to the POSIX *at interfaces (like
+ * openat (2)). The intention of these APIs is to operate relative to a parent
+ * object, looking up or creating child objects within it, OR to act on the
+ * object thus looked up or created and retrieve information about it.
+ *
+ * The APIs also provide for generating an opaque, invariant handle to the
+ * object that can later be used to look up the object, instead of the regular
+ * glfs_h_* variants. The APIs that provide this behaviour are
+ * glfs_h_extract_handle and glfs_h_create_from_handle.
+ *
+ * The object handles can be transitioned to fd based operations as supported
+ * by glfs.h calls, using the glfs_h_open call. This provides a way to move
+ * from objects to fd's akin to moving from path to fd for required operations.
+ *
+ * NOTE: The opaque invariant handle is the GFID of the object in reality, but
+ * maintained as an opaque data value, for potential internal changes to the
+ * same without impacting the caller.
+ *
+ * NOTE: Currently, looking up an object can create multiple object handles to
+ * the same object, i.e. distinct glfs_object pointers. Hence each such looked
+ * up or received handle from other calls would need to be closed. In the
+ * future, for a given object these pointers would be the same, and an
+ * ease-of-use API to forget all instances of this object would be provided
+ * (instead of a per-lookup close). This should not change the APIs in their
+ * current form.
+ *
+ */
+
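+/* Illustrative usage (a minimal sketch; the path and variable names below are
+ * only examples, and an already initialized and mounted 'struct glfs *fs' is
+ * assumed): a caller can look up an object, extract its invariant handle, and
+ * later recreate the object purely from those handle bytes.
+ *
+ * struct stat st;
+ * unsigned char gfh[GFAPI_HANDLE_LENGTH];
+ * struct glfs_object *obj = NULL, *obj2 = NULL;
+ *
+ * obj = glfs_h_lookupat (fs, NULL, "/dir/file", &st);
+ * (void) glfs_h_extract_handle (obj, gfh, GFAPI_HANDLE_LENGTH);
+ *
+ * // later, possibly after only the handle bytes were retained:
+ * obj2 = glfs_h_create_from_handle (fs, gfh, GFAPI_HANDLE_LENGTH, &st);
+ * struct glfs_fd *glfd = glfs_h_open (fs, obj2, O_RDONLY);
+ *
+ * glfs_h_close (obj);
+ * glfs_h_close (obj2);
+ */
+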
+/* Values for the valid flags to be used with the setattr variants, to select
+ which of the attribute values passed via the related stat structure are
+ applied.
+ */
+#define GFAPI_SET_ATTR_MODE 0x1
+#define GFAPI_SET_ATTR_UID 0x2
+#define GFAPI_SET_ATTR_GID 0x4
+#define GFAPI_SET_ATTR_SIZE 0x8
+#define GFAPI_SET_ATTR_ATIME 0x10
+#define GFAPI_SET_ATTR_MTIME 0x20
+
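+/* For example (an illustrative sketch only, assuming an existing 'fs' and
+ * 'object'), changing the owner and the mode of an object in one call:
+ *
+ * struct stat st = {0, };
+ * st.st_uid = 1000;
+ * st.st_mode = 0644;
+ * glfs_h_setattrs (fs, object, &st,
+ * GFAPI_SET_ATTR_UID | GFAPI_SET_ATTR_MODE);
+ *
+ * Only the fields whose GFAPI_SET_ATTR_* bit is present in 'valid' are read
+ * from the stat structure; all other fields are ignored.
+ */
+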
+/* Length of the opaque handle filled in by glfs_h_extract_handle and
+ * consumed by glfs_h_create_from_handle */
+#define GFAPI_HANDLE_LENGTH 16
+
+__BEGIN_DECLS
+
+/*
+ * Notes:
+ *
+ * The file object handle. One per looked up, created file/directory
+ *
+ * This had been introduced to facilitate gfid/inode based gfapi
+ * - a requirement introduced by nfs-ganesha
+ */
+struct glfs_object;
+typedef struct glfs_object glfs_object_t;
+
+/* Handle based operations */
+/* Operations that generate handles */
+struct glfs_object *glfs_h_lookupat (struct glfs *fs,
+ struct glfs_object *parent,
+ const char *path, struct stat *stat);
+
+struct glfs_object *glfs_h_creat (struct glfs *fs, struct glfs_object *parent,
+ const char *path, int flags, mode_t mode,
+ struct stat *sb);
+
+struct glfs_object *glfs_h_mkdir (struct glfs *fs, struct glfs_object *parent,
+ const char *path, mode_t flags,
+ struct stat *sb);
+
+struct glfs_object *glfs_h_mknod (struct glfs *fs, struct glfs_object *parent,
+ const char *path, mode_t mode, dev_t dev,
+ struct stat *sb);
+
+struct glfs_object *glfs_h_symlink (struct glfs *fs, struct glfs_object *parent,
+ const char *name, const char *data,
+ struct stat *stat);
+
+/* Operations on the actual objects */
+int glfs_h_unlink (struct glfs *fs, struct glfs_object *parent,
+ const char *path);
+
+int glfs_h_close (struct glfs_object *object);
+
+int glfs_caller_specific_init (void *uid_caller_key, void *gid_caller_key,
+ void *future);
+
+int glfs_h_truncate (struct glfs *fs, struct glfs_object *object, off_t offset);
+
+int glfs_h_stat(struct glfs *fs, struct glfs_object *object, struct stat *stat);
+
+int glfs_h_getattrs (struct glfs *fs, struct glfs_object *object,
+ struct stat *stat);
+
+int glfs_h_setattrs (struct glfs *fs, struct glfs_object *object,
+ struct stat *sb, int valid);
+
+int glfs_h_readlink (struct glfs *fs, struct glfs_object *object, char *buf,
+ size_t bufsiz);
+
+int glfs_h_link (struct glfs *fs, struct glfs_object *linktgt,
+ struct glfs_object *parent, const char *name);
+
+int glfs_h_rename (struct glfs *fs, struct glfs_object *olddir,
+ const char *oldname, struct glfs_object *newdir,
+ const char *newname);
+
+/* Operations enabling opaque invariant handle to object transitions */
+ssize_t glfs_h_extract_handle (struct glfs_object *object,
+ unsigned char *handle, int len);
+
+struct glfs_object *glfs_h_create_from_handle (struct glfs *fs,
+ unsigned char *handle, int len,
+ struct stat *stat);
+
+/* Operations enabling object handles to fd transitions */
+struct glfs_fd *glfs_h_opendir (struct glfs *fs, struct glfs_object *object);
+
+struct glfs_fd *glfs_h_open (struct glfs *fs, struct glfs_object *object,
+ int flags);
+
+__END_DECLS
+
+#endif /* !_GLFS_HANDLES_H */
\ No newline at end of file
diff --git a/api/src/glfs-internal.h b/api/src/glfs-internal.h
new file mode 100644
index 000000000..ec1d5579d
--- /dev/null
+++ b/api/src/glfs-internal.h
@@ -0,0 +1,200 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _GLFS_INTERNAL_H
+#define _GLFS_INTERNAL_H
+
+#include "xlator.h"
+
+#define GLFS_SYMLINK_MAX_FOLLOW 2048
+
+#define DEFAULT_REVAL_COUNT 1
+
+#define ESTALE_RETRY(ret,errno,reval,loc,label) do { \
+ if (ret == -1 && errno == ESTALE) { \
+ if (reval < DEFAULT_REVAL_COUNT) { \
+ reval++; \
+ loc_wipe (loc); \
+ goto label; \
+ } \
+ } \
+ } while (0)
+
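+/* An illustrative sketch of how a path-based caller might use ESTALE_RETRY
+ ('fs', 'subvol' and 'path' are assumed to exist in the caller): attempt the
+ resolution (or a subsequent syncop) and jump back to the retry label when it
+ fails with ESTALE, at most DEFAULT_REVAL_COUNT additional times.
+
+ int reval = 0;
+ int ret = -1;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ retry:
+ ret = glfs_resolve (fs, subvol, path, &loc, &iatt, reval);
+ ESTALE_RETRY (ret, errno, reval, &loc, retry);
+ */
+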
+#define GLFS_LOC_FILL_INODE(oinode, loc, label) do { \
+ loc.inode = inode_ref (oinode); \
+ uuid_copy (loc.gfid, oinode->gfid); \
+ ret = glfs_loc_touchup (&loc); \
+ if (ret != 0) { \
+ errno = EINVAL; \
+ goto label; \
+ } \
+ } while (0)
+
+#define GLFS_LOC_FILL_PINODE(pinode, loc, ret, errno, label, path) do { \
+ loc.inode = inode_new (pinode->table); \
+ if (!loc.inode) { \
+ ret = -1; \
+ errno = ENOMEM; \
+ goto label; \
+ } \
+ loc.parent = inode_ref (pinode); \
+ loc.name = path; \
+ ret = glfs_loc_touchup (&loc); \
+ if (ret != 0) { \
+ errno = EINVAL; \
+ goto label; \
+ } \
+ } while (0)
+
+struct glfs;
+
+typedef int (*glfs_init_cbk) (struct glfs *fs, int ret);
+
+struct glfs {
+ char *volname;
+
+ glusterfs_ctx_t *ctx;
+
+ pthread_t poller;
+
+ glfs_init_cbk init_cbk;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ int init;
+ int ret;
+ int err;
+
+ xlator_t *active_subvol;
+ xlator_t *next_subvol;
+ xlator_t *old_subvol;
+
+ char *oldvolfile;
+ ssize_t oldvollen;
+
+ inode_t *cwd;
+
+ uint32_t dev_id; /* Used to fill st_dev in struct stat */
+
+ struct list_head openfds;
+
+ gf_boolean_t migration_in_progress;
+};
+
+struct glfs_fd {
+ struct list_head openfds;
+ struct glfs *fs;
+ off_t offset;
+ fd_t *fd; /* Currently guarded by @fs->mutex. TODO: per-glfd lock */
+ struct list_head entries;
+ gf_dirent_t *next;
+ struct dirent *readdirbuf;
+};
+
+/* glfs object handle introduced for the alternate gfapi implementation based
+ on glfs handles/gfid/inode
+*/
+struct glfs_object {
+ inode_t *inode;
+ uuid_t gfid;
+};
+
+#define DEFAULT_EVENT_POOL_SIZE 16384
+#define GF_MEMPOOL_COUNT_OF_DICT_T 4096
+#define GF_MEMPOOL_COUNT_OF_DATA_T (GF_MEMPOOL_COUNT_OF_DICT_T * 4)
+#define GF_MEMPOOL_COUNT_OF_DATA_PAIR_T (GF_MEMPOOL_COUNT_OF_DICT_T * 4)
+
+int glfs_mgmt_init (struct glfs *fs);
+void glfs_init_done (struct glfs *fs, int ret);
+int glfs_process_volfp (struct glfs *fs, FILE *fp);
+int glfs_resolve (struct glfs *fs, xlator_t *subvol, const char *path, loc_t *loc,
+ struct iatt *iatt, int reval);
+int glfs_lresolve (struct glfs *fs, xlator_t *subvol, const char *path, loc_t *loc,
+ struct iatt *iatt, int reval);
+fd_t *glfs_resolve_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd);
+
+fd_t *__glfs_migrate_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd);
+
+int glfs_first_lookup (xlator_t *subvol);
+
+static inline void
+__glfs_entry_fs (struct glfs *fs)
+{
+ THIS = fs->ctx->master;
+}
+
+
+static inline void
+__glfs_entry_fd (struct glfs_fd *fd)
+{
+ THIS = fd->fd->inode->table->xl->ctx->master;
+}
+
+
+/*
+ By default all lock attempts from user context must
+ use glfs_lock() and glfs_unlock(). This allows
+ for a safe implementation of graph migration where
+ we can give up the mutex during syncop calls so
+ that bottom up calls (particularly CHILD_UP notify)
+ can do a mutex_lock() on @glfs without deadlocking
+ the filesystem
+*/
+static inline int
+glfs_lock (struct glfs *fs)
+{
+ pthread_mutex_lock (&fs->mutex);
+
+ while (!fs->init)
+ pthread_cond_wait (&fs->cond, &fs->mutex);
+
+ while (fs->migration_in_progress)
+ pthread_cond_wait (&fs->cond, &fs->mutex);
+
+ return 0;
+}
+
+
+static inline void
+glfs_unlock (struct glfs *fs)
+{
+ pthread_mutex_unlock (&fs->mutex);
+}
+
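+/* A sketch of the intended calling convention (illustrative only; 'fs' and
+ 'subvol' are assumed to be in scope): code running in user/application
+ context brackets access to the graph state with glfs_lock()/glfs_unlock(),
+ for example
+
+ glfs_lock (fs);
+ {
+ subvol = __glfs_active_subvol (fs);
+ }
+ glfs_unlock (fs);
+
+ whereas bottom-up notifications (e.g. glfs_graph_setup()) take fs->mutex
+ directly, as described above.
+*/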
+
+void glfs_fd_destroy (struct glfs_fd *glfd);
+
+struct glfs_fd *glfs_fd_new (struct glfs *fs);
+void glfs_fd_bind (struct glfs_fd *glfd);
+
+xlator_t * glfs_active_subvol (struct glfs *fs);
+xlator_t * __glfs_active_subvol (struct glfs *fs);
+void glfs_subvol_done (struct glfs *fs, xlator_t *subvol);
+
+inode_t * glfs_refresh_inode (xlator_t *subvol, inode_t *inode);
+
+inode_t *glfs_cwd_get (struct glfs *fs);
+int glfs_cwd_set (struct glfs *fs, inode_t *inode);
+inode_t *glfs_resolve_inode (struct glfs *fs, xlator_t *subvol,
+ struct glfs_object *object);
+int glfs_create_object (loc_t *loc, struct glfs_object **retobject);
+int __glfs_cwd_set (struct glfs *fs, inode_t *inode);
+
+int glfs_resolve_base (struct glfs *fs, xlator_t *subvol, inode_t *inode,
+ struct iatt *iatt);
+int glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
+ const char *origpath, loc_t *loc, struct iatt *iatt,
+ int follow, int reval);
+int glfs_loc_touchup (loc_t *loc);
+void glfs_iatt_to_stat (struct glfs *fs, struct iatt *iatt, struct stat *stat);
+int glfs_loc_link (loc_t *loc, struct iatt *iatt);
+int glfs_loc_unlink (loc_t *loc);
+
+#endif /* !_GLFS_INTERNAL_H */
diff --git a/api/src/glfs-master.c b/api/src/glfs-master.c
new file mode 100644
index 000000000..c02534c18
--- /dev/null
+++ b/api/src/glfs-master.c
@@ -0,0 +1,154 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <limits.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "glusterfs.h"
+
+#include "glfs-internal.h"
+#include "glfs-mem-types.h"
+
+
+int
+glfs_graph_setup (struct glfs *fs, glusterfs_graph_t *graph)
+{
+ xlator_t *new_subvol = NULL;
+ xlator_t *old_subvol = NULL;
+ inode_table_t *itable = NULL;
+ int ret = -1;
+
+ new_subvol = graph->top;
+
+ /* This is called in a bottom-up context, it should specifically
+ NOT be glfs_lock()
+ */
+ pthread_mutex_lock (&fs->mutex);
+ {
+ if (new_subvol->switched ||
+ new_subvol == fs->active_subvol ||
+ new_subvol == fs->next_subvol) {
+ /* Spurious CHILD_UP event on old graph */
+ ret = 0;
+ goto unlock;
+ }
+
+ if (!new_subvol->itable) {
+ itable = inode_table_new (131072, new_subvol);
+ if (!itable) {
+ errno = ENOMEM;
+ ret = -1;
+ goto unlock;
+ }
+
+ new_subvol->itable = itable;
+ }
+
+ old_subvol = fs->next_subvol;
+ fs->next_subvol = new_subvol;
+ fs->next_subvol->winds++; /* first ref */
+ ret = 0;
+ }
+unlock:
+ pthread_mutex_unlock (&fs->mutex);
+
+ if (old_subvol)
+ /* wasn't picked up so far, skip */
+ glfs_subvol_done (fs, old_subvol);
+
+ return ret;
+}
+
+
+int
+notify (xlator_t *this, int event, void *data, ...)
+{
+ glusterfs_graph_t *graph = NULL;
+ struct glfs *fs = NULL;
+
+ graph = data;
+ fs = this->private;
+
+ switch (event) {
+ case GF_EVENT_GRAPH_NEW:
+ gf_log (this->name, GF_LOG_INFO, "New graph %s (%d) coming up",
+ uuid_utoa ((unsigned char *)graph->graph_uuid),
+ graph->id);
+ break;
+ case GF_EVENT_CHILD_UP:
+ glfs_graph_setup (fs, graph);
+ glfs_init_done (fs, 0);
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ glfs_graph_setup (fs, graph);
+ glfs_init_done (fs, 1);
+ break;
+ case GF_EVENT_CHILD_CONNECTING:
+ break;
+ default:
+ gf_log (this->name, GF_LOG_DEBUG,
+ "got notify event %d", event);
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, glfs_mt_end + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to initialise "
+ "memory accounting");
+ return ret;
+ }
+
+ return 0;
+}
+
+
+int
+init (xlator_t *this)
+{
+ return 0;
+}
+
+
+void
+fini (xlator_t *this)
+{
+
+}
+
+
+struct xlator_dumpops dumpops;
+
+
+struct xlator_fops fops;
+
+
+struct xlator_cbks cbks;
diff --git a/api/src/glfs-mem-types.h b/api/src/glfs-mem-types.h
new file mode 100644
index 000000000..3301b3da5
--- /dev/null
+++ b/api/src/glfs-mem-types.h
@@ -0,0 +1,32 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLFS_MEM_TYPES_H
+#define _GLFS_MEM_TYPES_H
+
+#include "mem-types.h"
+
+#define GF_MEM_TYPE_START (gf_common_mt_end + 1)
+
+enum glfs_mem_types_ {
+ glfs_mt_glfs_t = GF_MEM_TYPE_START,
+ glfs_mt_call_pool_t,
+ glfs_mt_xlator_t,
+ glfs_mt_glfs_fd_t,
+ glfs_mt_glfs_io_t,
+ glfs_mt_volfile_t,
+ glfs_mt_xlator_cmdline_option_t,
+ glfs_mt_glfs_object_t,
+ glfs_mt_readdirbuf_t,
+ glfs_mt_end
+
+};
+#endif
+
diff --git a/api/src/glfs-mgmt.c b/api/src/glfs-mgmt.c
new file mode 100644
index 000000000..6843e9cb3
--- /dev/null
+++ b/api/src/glfs-mgmt.c
@@ -0,0 +1,543 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <pthread.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif /* _CONFIG_H */
+
+#include "glusterfs.h"
+#include "stack.h"
+#include "dict.h"
+#include "event.h"
+#include "defaults.h"
+
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "glusterfs3.h"
+#include "portmap-xdr.h"
+#include "xdr-generic.h"
+
+#include "syncop.h"
+#include "xlator.h"
+
+#include "glfs-internal.h"
+#include "glfs-mem-types.h"
+
+
+int glfs_volfile_fetch (struct glfs *fs);
+
+int
+glfs_process_volfp (struct glfs *fs, FILE *fp)
+{
+ glusterfs_graph_t *graph = NULL;
+ int ret = -1;
+ xlator_t *trav = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+
+ ctx = fs->ctx;
+ graph = glusterfs_graph_construct (fp);
+ if (!graph) {
+ gf_log ("glfs", GF_LOG_ERROR, "failed to construct the graph");
+ goto out;
+ }
+
+ for (trav = graph->first; trav; trav = trav->next) {
+ if (strcmp (trav->type, "mount/fuse") == 0) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "fuse xlator cannot be specified "
+ "in volume file");
+ goto out;
+ }
+ }
+
+ ret = glusterfs_graph_prepare (graph, ctx);
+ if (ret) {
+ glusterfs_graph_destroy (graph);
+ goto out;
+ }
+
+ ret = glusterfs_graph_activate (graph, ctx);
+
+ if (ret) {
+ glusterfs_graph_destroy (graph);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (fp)
+ fclose (fp);
+
+ if (!ctx->active) {
+ ret = -1;
+ }
+
+ return ret;
+}
+
+
+int
+mgmt_cbk_spec (struct rpc_clnt *rpc, void *mydata, void *data)
+{
+ struct glfs *fs = NULL;
+ xlator_t *this = NULL;
+
+ this = mydata;
+ fs = this->private;
+
+ glfs_volfile_fetch (fs);
+
+ return 0;
+}
+
+
+int
+mgmt_cbk_event (struct rpc_clnt *rpc, void *mydata, void *data)
+{
+ return 0;
+}
+
+
+rpcclnt_cb_actor_t mgmt_cbk_actors[] = {
+ [GF_CBK_FETCHSPEC] = {"FETCHSPEC", GF_CBK_FETCHSPEC, mgmt_cbk_spec },
+ [GF_CBK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_CBK_EVENT_NOTIFY,
+ mgmt_cbk_event},
+};
+
+
+struct rpcclnt_cb_program mgmt_cbk_prog = {
+ .progname = "GlusterFS Callback",
+ .prognum = GLUSTER_CBK_PROGRAM,
+ .progver = GLUSTER_CBK_VERSION,
+ .actors = mgmt_cbk_actors,
+ .numactors = GF_CBK_MAXVALUE,
+};
+
+char *clnt_handshake_procs[GF_HNDSK_MAXVALUE] = {
+ [GF_HNDSK_NULL] = "NULL",
+ [GF_HNDSK_SETVOLUME] = "SETVOLUME",
+ [GF_HNDSK_GETSPEC] = "GETSPEC",
+ [GF_HNDSK_PING] = "PING",
+ [GF_HNDSK_EVENT_NOTIFY] = "EVENTNOTIFY",
+};
+
+rpc_clnt_prog_t clnt_handshake_prog = {
+ .progname = "GlusterFS Handshake",
+ .prognum = GLUSTER_HNDSK_PROGRAM,
+ .progver = GLUSTER_HNDSK_VERSION,
+ .procnames = clnt_handshake_procs,
+};
+
+
+int
+mgmt_submit_request (void *req, call_frame_t *frame,
+ glusterfs_ctx_t *ctx,
+ rpc_clnt_prog_t *prog, int procnum,
+ fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+{
+ int ret = -1;
+ int count = 0;
+ struct iovec iov = {0, };
+ struct iobuf *iobuf = NULL;
+ struct iobref *iobref = NULL;
+ ssize_t xdr_size = 0;
+
+ iobref = iobref_new ();
+ if (!iobref) {
+ goto out;
+ }
+
+ if (req) {
+ xdr_size = xdr_sizeof (xdrproc, req);
+
+ iobuf = iobuf_get2 (ctx->iobuf_pool, xdr_size);
+ if (!iobuf) {
+ goto out;
+ };
+
+ iobref_add (iobref, iobuf);
+
+ iov.iov_base = iobuf->ptr;
+ iov.iov_len = iobuf_pagesize (iobuf);
+
+ /* Create the xdr payload */
+ ret = xdr_serialize_generic (iov, req, xdrproc);
+ if (ret == -1) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to create XDR payload");
+ goto out;
+ }
+ iov.iov_len = ret;
+ count = 1;
+ }
+
+ /* Send the msg */
+ ret = rpc_clnt_submit (ctx->mgmt, prog, procnum, cbkfn,
+ &iov, count,
+ NULL, 0, iobref, frame, NULL, 0, NULL, 0, NULL);
+
+out:
+ if (iobref)
+ iobref_unref (iobref);
+
+ if (iobuf)
+ iobuf_unref (iobuf);
+ return ret;
+}
+
+
+static int
+glusterfs_oldvolfile_update (struct glfs *fs, char *volfile, ssize_t size)
+{
+ int ret = -1;
+
+ fs->oldvollen = size;
+ if (!fs->oldvolfile) {
+ fs->oldvolfile = GF_CALLOC (1, size+1, glfs_mt_volfile_t);
+ } else {
+ fs->oldvolfile = GF_REALLOC (fs->oldvolfile, size+1);
+ }
+
+ if (!fs->oldvolfile) {
+ fs->oldvollen = 0;
+ } else {
+ memcpy (fs->oldvolfile, volfile, size);
+ fs->oldvollen = size;
+ ret = 0;
+ }
+
+ return ret;
+}
+
+
+int
+mgmt_getspec_cbk (struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ gf_getspec_rsp rsp = {0,};
+ call_frame_t *frame = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ int ret = 0;
+ ssize_t size = 0;
+ FILE *tmpfp = NULL;
+ int need_retry = 0;
+ struct glfs *fs = NULL;
+
+ frame = myframe;
+ ctx = frame->this->ctx;
+ fs = ((xlator_t *)ctx->master)->private;
+
+ if (-1 == req->rpc_status) {
+ ret = -1;
+ need_retry = 1;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_getspec_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "XDR decoding error");
+ ret = -1;
+ goto out;
+ }
+
+ if (-1 == rsp.op_ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "failed to get the 'volume file' from server");
+ ret = -1;
+ errno = rsp.op_errno;
+ goto out;
+ }
+
+ ret = 0;
+ size = rsp.op_ret;
+
+ if ((size == fs->oldvollen) &&
+ (memcmp (fs->oldvolfile, rsp.spec, size) == 0)) {
+ gf_log (frame->this->name, GF_LOG_INFO,
+ "No change in volfile, continuing");
+ goto out;
+ }
+
+ tmpfp = tmpfile ();
+ if (!tmpfp) {
+ ret = -1;
+ goto out;
+ }
+
+ fwrite (rsp.spec, size, 1, tmpfp);
+ fflush (tmpfp);
+ if (ferror (tmpfp)) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Check if only options have changed. There is no need to reload the
+ * volfile if the topology hasn't changed.
+ * glusterfs_volfile_reconfigure has 3 possible return states:
+ * return 0 =======> reconfiguration of options has succeeded
+ * return 1 =======> the graph has to be reconstructed and all the
+ * xlators have to be initialized
+ * return -1 (or -ve) =======> some internal error occurred during the operation
+ */
+
+ ret = glusterfs_volfile_reconfigure (fs->oldvollen, tmpfp, fs->ctx,
+ fs->oldvolfile);
+ if (ret == 0) {
+ gf_log ("glusterfsd-mgmt", GF_LOG_DEBUG,
+ "No need to re-load volfile, reconfigure done");
+ ret = glusterfs_oldvolfile_update (fs, rsp.spec, size);
+ goto out;
+ }
+
+ if (ret < 0) {
+ gf_log ("glusterfsd-mgmt", GF_LOG_DEBUG,
+ "Reconfigure failed !!");
+ goto out;
+ }
+
+ ret = glfs_process_volfp (fs, tmpfp);
+ /* tmpfp closed */
+ tmpfp = NULL;
+ if (ret)
+ goto out;
+
+ ret = glusterfs_oldvolfile_update (fs, rsp.spec, size);
+out:
+ STACK_DESTROY (frame->root);
+
+ if (rsp.spec)
+ free (rsp.spec);
+
+ // Stop if server is running at an unsupported op-version
+ if (ENOTSUP == ret) {
+ gf_log ("mgmt", GF_LOG_ERROR, "Server is operating at an "
+ "op-version which is not supported");
+ errno = ENOTSUP;
+ glfs_init_done (fs, -1);
+ }
+
+ if (ret && ctx && !ctx->active) {
+ /* Do it only for the first time */
+ /* Failed to get the volume file, something wrong,
+ restart the process */
+ gf_log ("glfs-mgmt", GF_LOG_ERROR,
+ "failed to fetch volume file (key:%s)",
+ ctx->cmd_args.volfile_id);
+ if (!need_retry) {
+ if (!errno)
+ errno = EINVAL;
+ glfs_init_done (fs, -1);
+ }
+ }
+
+ if (tmpfp)
+ fclose (tmpfp);
+
+ return 0;
+}
+
+
+int
+glfs_volfile_fetch (struct glfs *fs)
+{
+ cmd_args_t *cmd_args = NULL;
+ gf_getspec_req req = {0, };
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ dict_t *dict = NULL;
+
+ ctx = fs->ctx;
+ cmd_args = &ctx->cmd_args;
+
+ frame = create_frame (THIS, ctx->pool);
+
+ req.key = cmd_args->volfile_id;
+ req.flags = 0;
+
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ // Set the supported min and max op-versions, so glusterd can make a
+ // decision
+ ret = dict_set_int32 (dict, "min-op-version", GD_OP_VERSION_MIN);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set min-op-version"
+ " in request dict");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "max-op-version", GD_OP_VERSION_MAX);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set max-op-version"
+ " in request dict");
+ goto out;
+ }
+
+ ret = dict_allocate_and_serialize (dict, &req.xdata.xdata_val,
+ &req.xdata.xdata_len);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Failed to serialize dictionary");
+ goto out;
+ }
+
+ ret = mgmt_submit_request (&req, frame, ctx, &clnt_handshake_prog,
+ GF_HNDSK_GETSPEC, mgmt_getspec_cbk,
+ (xdrproc_t)xdr_gf_getspec_req);
+out:
+ return ret;
+}
+
+
+static int
+mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
+ void *data)
+{
+ xlator_t *this = NULL;
+ cmd_args_t *cmd_args = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ struct glfs *fs = NULL;
+ int ret = 0;
+
+ this = mydata;
+ ctx = this->ctx;
+ fs = ((xlator_t *)ctx->master)->private;
+ cmd_args = &ctx->cmd_args;
+
+ switch (event) {
+ case RPC_CLNT_DISCONNECT:
+ if (!ctx->active) {
+ cmd_args->max_connect_attempts--;
+ gf_log ("glfs-mgmt", GF_LOG_ERROR,
+ "failed to connect with remote-host: %s",
+ strerror (errno));
+ gf_log ("glfs-mgmt", GF_LOG_INFO,
+ "%d connect attempts left",
+ cmd_args->max_connect_attempts);
+ if (0 >= cmd_args->max_connect_attempts) {
+ errno = ENOTCONN;
+ glfs_init_done (fs, -1);
+ }
+ }
+ break;
+ case RPC_CLNT_CONNECT:
+ rpc_clnt_set_connected (&((struct rpc_clnt*)ctx->mgmt)->conn);
+
+ ret = glfs_volfile_fetch (fs);
+ if (ret && ctx && (ctx->active == NULL)) {
+ /* Do it only for the first time */
+ /* Exit the process.. there are some wrong options */
+ gf_log ("glfs-mgmt", GF_LOG_ERROR,
+ "failed to fetch volume file (key:%s)",
+ ctx->cmd_args.volfile_id);
+ errno = EINVAL;
+ glfs_init_done (fs, -1);
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+glusterfs_mgmt_notify (int32_t op, void *data, ...)
+{
+ int ret = 0;
+
+ switch (op)
+ {
+ case GF_EN_DEFRAG_STATUS:
+ break;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+
+int
+glfs_mgmt_init (struct glfs *fs)
+{
+ cmd_args_t *cmd_args = NULL;
+ struct rpc_clnt *rpc = NULL;
+ dict_t *options = NULL;
+ int ret = -1;
+ int port = GF_DEFAULT_BASE_PORT;
+ char *host = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+
+ ctx = fs->ctx;
+ cmd_args = &ctx->cmd_args;
+
+ if (ctx->mgmt)
+ return 0;
+
+ if (cmd_args->volfile_server_port)
+ port = cmd_args->volfile_server_port;
+
+ host = "localhost";
+ if (cmd_args->volfile_server)
+ host = cmd_args->volfile_server;
+
+ ret = rpc_transport_inet_options_build (&options, host, port);
+ if (ret)
+ goto out;
+
+ rpc = rpc_clnt_new (options, THIS->ctx, THIS->name, 8);
+ if (!rpc) {
+ ret = -1;
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to create rpc clnt");
+ goto out;
+ }
+
+ ret = rpc_clnt_register_notify (rpc, mgmt_rpc_notify, THIS);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to register notify function");
+ goto out;
+ }
+
+ ret = rpcclnt_cbk_program_register (rpc, &mgmt_cbk_prog, THIS);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to register callback function");
+ goto out;
+ }
+
+ ctx->notify = glusterfs_mgmt_notify;
+
+ /* This value should be set before doing the 'rpc_clnt_start()' as
+ the notify function uses this variable */
+ ctx->mgmt = rpc;
+
+ ret = rpc_clnt_start (rpc);
+out:
+ return ret;
+}
+
diff --git a/api/src/glfs-resolve.c b/api/src/glfs-resolve.c
new file mode 100644
index 000000000..4ca2eb6fc
--- /dev/null
+++ b/api/src/glfs-resolve.c
@@ -0,0 +1,969 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <limits.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "stack.h"
+#include "event.h"
+#include "glfs-mem-types.h"
+#include "common-utils.h"
+#include "syncop.h"
+#include "call-stub.h"
+
+#include "glfs-internal.h"
+
+#define graphid_str(subvol) (uuid_utoa((unsigned char *)subvol->graph->graph_uuid))
+
+
+int
+glfs_first_lookup_safe (xlator_t *subvol)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+
+ loc.inode = subvol->itable->root;
+ memset (loc.gfid, 0, 16);
+ loc.gfid[15] = 1;
+ loc.path = "/";
+ loc.name = "";
+
+ ret = syncop_lookup (subvol, &loc, 0, 0, 0, 0);
+
+ gf_log (subvol->name, GF_LOG_DEBUG, "first lookup complete %d", ret);
+
+ return ret;
+}
+
+
+int
+__glfs_first_lookup (struct glfs *fs, xlator_t *subvol)
+{
+ int ret = -1;
+
+ fs->migration_in_progress = 1;
+ pthread_mutex_unlock (&fs->mutex);
+ {
+ ret = glfs_first_lookup_safe (subvol);
+ }
+ pthread_mutex_lock (&fs->mutex);
+ fs->migration_in_progress = 0;
+ pthread_cond_broadcast (&fs->cond);
+
+ return ret;
+}
+
+
+inode_t *
+glfs_refresh_inode_safe (xlator_t *subvol, inode_t *oldinode)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ struct iatt iatt = {0, };
+ inode_t *newinode = NULL;
+
+
+ if (!oldinode)
+ return NULL;
+
+ if (oldinode->table->xl == subvol)
+ return inode_ref (oldinode);
+
+ newinode = inode_find (subvol->itable, oldinode->gfid);
+ if (newinode)
+ return newinode;
+
+ uuid_copy (loc.gfid, oldinode->gfid);
+ loc.inode = inode_new (subvol->itable);
+ if (!loc.inode)
+ return NULL;
+
+ ret = syncop_lookup (subvol, &loc, 0, &iatt, 0, 0);
+
+ if (ret) {
+ gf_log (subvol->name, GF_LOG_WARNING,
+ "inode refresh of %s failed: %s",
+ uuid_utoa (oldinode->gfid), strerror (errno));
+ loc_wipe (&loc);
+ return NULL;
+ }
+
+ newinode = inode_link (loc.inode, 0, 0, &iatt);
+ if (newinode)
+ inode_lookup (newinode);
+
+ loc_wipe (&loc);
+
+ return newinode;
+}
+
+
+inode_t *
+__glfs_refresh_inode (struct glfs *fs, xlator_t *subvol, inode_t *inode)
+{
+ inode_t *newinode = NULL;
+
+ fs->migration_in_progress = 1;
+ pthread_mutex_unlock (&fs->mutex);
+ {
+ newinode = glfs_refresh_inode_safe (subvol, inode);
+ }
+ pthread_mutex_lock (&fs->mutex);
+ fs->migration_in_progress = 0;
+ pthread_cond_broadcast (&fs->cond);
+
+ return newinode;
+}
+
+int
+glfs_loc_touchup (loc_t *loc)
+{
+ char *path = NULL;
+ int ret = -1;
+ char *bn = NULL;
+
+ if (loc->parent)
+ ret = inode_path (loc->parent, loc->name, &path);
+ else
+ ret = inode_path (loc->inode, 0, &path);
+
+ loc->path = path;
+
+ if (ret < 0 || !path) {
+ ret = -1;
+ errno = ENOMEM;
+ goto out;
+ }
+
+ bn = strrchr (path, '/');
+ if (bn)
+ bn++;
+ loc->name = bn;
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+glfs_resolve_symlink (struct glfs *fs, xlator_t *subvol, inode_t *inode,
+ char **lpath)
+{
+ loc_t loc = {0, };
+ char *path = NULL;
+ char *rpath = NULL;
+ int ret = -1;
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+ ret = inode_path (inode, NULL, &rpath);
+ if (ret < 0)
+ goto out;
+ loc.path = rpath;
+
+ ret = syncop_readlink (subvol, &loc, &path, 4096);
+
+ if (ret < 0)
+ goto out;
+
+ if (lpath)
+ *lpath = path;
+out:
+ loc_wipe (&loc);
+ return ret;
+}
+
+
+int
+glfs_resolve_base (struct glfs *fs, xlator_t *subvol, inode_t *inode,
+ struct iatt *iatt)
+{
+ loc_t loc = {0, };
+ int ret = -1;
+ char *path = NULL;
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ ret = inode_path (loc.inode, NULL, &path);
+ loc.path = path;
+ if (ret < 0)
+ goto out;
+
+ ret = syncop_lookup (subvol, &loc, NULL, iatt, NULL, NULL);
+out:
+ loc_wipe (&loc);
+
+ return ret;
+}
+
+
+inode_t *
+glfs_resolve_component (struct glfs *fs, xlator_t *subvol, inode_t *parent,
+ const char *component, struct iatt *iatt,
+ int force_lookup)
+{
+ loc_t loc = {0, };
+ inode_t *inode = NULL;
+ int reval = 0;
+ int ret = -1;
+ int glret = -1;
+ struct iatt ciatt = {0, };
+ uuid_t gfid;
+ dict_t *xattr_req = NULL;
+
+ loc.name = component;
+
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, parent->gfid);
+
+
+ if (strcmp (component, ".") == 0)
+ loc.inode = inode_ref (parent);
+ else if (strcmp (component, "..") == 0)
+ loc.inode = inode_parent (parent, 0, 0);
+ else
+ loc.inode = inode_grep (parent->table, parent, component);
+
+ if (loc.inode) {
+ uuid_copy (loc.gfid, loc.inode->gfid);
+ reval = 1;
+
+ if (!force_lookup) {
+ inode = inode_ref (loc.inode);
+ ciatt.ia_type = inode->ia_type;
+ goto found;
+ }
+ } else {
+ uuid_generate (gfid);
+ loc.inode = inode_new (parent->table);
+ }
+
+ if (!loc.inode)
+ goto out;
+
+ glret = glfs_loc_touchup (&loc);
+ if (glret < 0) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_lookup (subvol, &loc, NULL, &ciatt, NULL, NULL);
+ if (ret && reval) {
+ inode_unref (loc.inode);
+ loc.inode = inode_new (parent->table);
+ if (!loc.inode) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ uuid_generate (gfid);
+
+ ret = dict_set_static_bin (xattr_req, "gfid-req", gfid, 16);
+ if (ret) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_lookup (subvol, &loc, xattr_req, &ciatt,
+ NULL, NULL);
+ }
+ if (ret)
+ goto out;
+
+ inode = inode_link (loc.inode, loc.parent, component, &ciatt);
+found:
+ if (inode)
+ inode_lookup (inode);
+ if (iatt)
+ *iatt = ciatt;
+out:
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ loc_wipe (&loc);
+
+ return inode;
+}
+
+
+int
+glfs_resolve_at (struct glfs *fs, xlator_t *subvol, inode_t *at,
+ const char *origpath, loc_t *loc, struct iatt *iatt,
+ int follow, int reval)
+{
+ inode_t *inode = NULL;
+ inode_t *parent = NULL;
+ char *saveptr = NULL;
+ char *path = NULL;
+ char *component = NULL;
+ char *next_component = NULL;
+ int ret = -1;
+ struct iatt ciatt = {0, };
+
+ path = gf_strdup (origpath);
+ if (!path) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ parent = NULL;
+ if (at && path[0] != '/') {
+ /* A relative resolution of a path which starts with '/'
+ is equal to an absolute path resolution.
+ */
+ inode = inode_ref (at);
+ } else {
+ inode = inode_ref (subvol->itable->root);
+
+ if (strcmp (path, "/") == 0)
+ glfs_resolve_base (fs, subvol, inode, &ciatt);
+ }
+
+ for (component = strtok_r (path, "/", &saveptr);
+ component; component = next_component) {
+
+ next_component = strtok_r (NULL, "/", &saveptr);
+
+ if (parent)
+ inode_unref (parent);
+
+ parent = inode;
+
+ inode = glfs_resolve_component (fs, subvol, parent,
+ component, &ciatt,
+ /* force hard lookup on the last
+ component, as the caller
+ wants proper iatt filled
+ */
+ (reval || (!next_component &&
+ iatt)));
+ if (!inode)
+ break;
+
+ if (IA_ISLNK (ciatt.ia_type) && (next_component || follow)) {
+ /* If the component is not the last piece,
+ then following it is necessary even if
+ not requested by the caller
+ */
+ char *lpath = NULL;
+ loc_t sym_loc = {0,};
+
+ if (follow > GLFS_SYMLINK_MAX_FOLLOW) {
+ errno = ELOOP;
+ ret = -1;
+ if (inode) {
+ inode_unref (inode);
+ inode = NULL;
+ }
+ break;
+ }
+
+ ret = glfs_resolve_symlink (fs, subvol, inode, &lpath);
+ inode_unref (inode);
+ inode = NULL;
+ if (ret < 0)
+ break;
+
+ ret = glfs_resolve_at (fs, subvol, parent, lpath,
+ &sym_loc,
+ /* followed iatt becomes the
+ component iatt
+ */
+ &ciatt,
+ /* always recursively follow while
+ following symlink
+ */
+ follow + 1, reval);
+ if (ret == 0)
+ inode = inode_ref (sym_loc.inode);
+ loc_wipe (&sym_loc);
+ GF_FREE (lpath);
+ }
+
+ if (!next_component)
+ break;
+
+ if (!IA_ISDIR (ciatt.ia_type)) {
+ /* next_component exists and this component is
+ not a directory
+ */
+ inode_unref (inode);
+ inode = NULL;
+ ret = -1;
+ errno = ENOTDIR;
+ break;
+ }
+ }
+
+ if (parent && next_component)
+ /* resolution failed mid-way */
+ goto out;
+
+ /* At this point, all components up to the last parent directory
+ have been resolved successfully (@parent). Resolution of basename
+ might have failed (@inode) if at all.
+ */
+
+ loc->parent = parent;
+ if (parent) {
+ uuid_copy (loc->pargfid, parent->gfid);
+ loc->name = component;
+ }
+
+ loc->inode = inode;
+ if (inode) {
+ uuid_copy (loc->gfid, inode->gfid);
+ if (iatt)
+ *iatt = ciatt;
+ ret = 0;
+ }
+
+ glfs_loc_touchup (loc);
+out:
+ GF_FREE (path);
+
+ /* do NOT loc_wipe here as only last component might be missing */
+
+ return ret;
+}
+
+
+int
+glfs_resolve_path (struct glfs *fs, xlator_t *subvol, const char *origpath,
+ loc_t *loc, struct iatt *iatt, int follow, int reval)
+{
+ int ret = -1;
+ inode_t *cwd = NULL;
+
+ if (origpath[0] == '/')
+ return glfs_resolve_at (fs, subvol, NULL, origpath, loc, iatt,
+ follow, reval);
+
+ cwd = glfs_cwd_get (fs);
+
+ ret = glfs_resolve_at (fs, subvol, cwd, origpath, loc, iatt,
+ follow, reval);
+ if (cwd)
+ inode_unref (cwd);
+
+ return ret;
+}
+
+
+int
+glfs_resolve (struct glfs *fs, xlator_t *subvol, const char *origpath,
+ loc_t *loc, struct iatt *iatt, int reval)
+{
+ int ret = -1;
+
+ ret = glfs_resolve_path (fs, subvol, origpath, loc, iatt, 1, reval);
+
+ return ret;
+}
+
+
+int
+glfs_lresolve (struct glfs *fs, xlator_t *subvol, const char *origpath,
+ loc_t *loc, struct iatt *iatt, int reval)
+{
+ int ret = -1;
+
+ ret = glfs_resolve_path (fs, subvol, origpath, loc, iatt, 0, reval);
+
+ return ret;
+}
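
As an illustrative sketch (the path here is hypothetical, and "fs" and "subvol" are assumed to be a valid virtual mount and its active subvolume), the follow flag above is what distinguishes the two wrappers: glfs_resolve() follows a trailing symlink, as stat(2) would, while glfs_lresolve() resolves the symlink itself, as lstat(2) would.

        loc_t loc = {0, };
        struct iatt iatt = {0, };

        /* follow a trailing symlink, as stat(2) would */
        if (glfs_resolve (fs, subvol, "/dir/link", &loc, &iatt, 0) == 0) {
                /* iatt now describes the symlink's target */
        }
        loc_wipe (&loc);

        /* resolve the symlink itself, as lstat(2) would */
        if (glfs_lresolve (fs, subvol, "/dir/link", &loc, &iatt, 0) == 0) {
                /* iatt now describes the symlink itself */
        }
        loc_wipe (&loc);
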
+
+
+int
+glfs_migrate_fd_locks_safe (struct glfs *fs, xlator_t *oldsubvol, fd_t *oldfd,
+ xlator_t *newsubvol, fd_t *newfd)
+{
+ dict_t *lockinfo = NULL;
+ int ret = 0;
+ char uuid1[64];
+
+ if (!oldfd->lk_ctx || fd_lk_ctx_empty (oldfd->lk_ctx))
+ return 0;
+
+ newfd->lk_ctx = fd_lk_ctx_ref (oldfd->lk_ctx);
+
+ ret = syncop_fgetxattr (oldsubvol, oldfd, &lockinfo,
+ GF_XATTR_LOCKINFO_KEY);
+ if (ret < 0) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fgetxattr (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (oldfd->inode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (oldsubvol), oldsubvol->graph->id);
+ goto out;
+ }
+
+ if (!dict_get (lockinfo, GF_XATTR_LOCKINFO_KEY)) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "missing lokinfo key (%s) on graph %s (%d)",
+ uuid_utoa_r (oldfd->inode->gfid, uuid1),
+ graphid_str (oldsubvol), oldsubvol->graph->id);
+ goto out;
+ }
+
+ ret = syncop_fsetxattr (newsubvol, newfd, lockinfo, 0);
+ if (ret < 0) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fsetxattr (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (newfd->inode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+out:
+ if (lockinfo)
+ dict_unref (lockinfo);
+ return ret;
+}
+
+
+fd_t *
+glfs_migrate_fd_safe (struct glfs *fs, xlator_t *newsubvol, fd_t *oldfd)
+{
+ fd_t *newfd = NULL;
+ inode_t *oldinode = NULL;
+ inode_t *newinode = NULL;
+ xlator_t *oldsubvol = NULL;
+ int ret = -1;
+ loc_t loc = {0, };
+ char uuid1[64];
+
+
+ oldinode = oldfd->inode;
+ oldsubvol = oldinode->table->xl;
+
+ if (oldsubvol == newsubvol)
+ return fd_ref (oldfd);
+
+ if (!oldsubvol->switched) {
+ ret = syncop_fsync (oldsubvol, oldfd, 0);
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fsync() failed (%s) on %s graph %s (%d)",
+ strerror (errno),
+ uuid_utoa_r (oldfd->inode->gfid, uuid1),
+ graphid_str (oldsubvol), oldsubvol->graph->id);
+ }
+ }
+
+ newinode = glfs_refresh_inode_safe (newsubvol, oldinode);
+ if (!newinode) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "inode (%s) refresh failed (%s) on graph %s (%d)",
+ uuid_utoa_r (oldinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ newfd = fd_create (newinode, getpid());
+ if (!newfd) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "fd_create (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (newinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ loc.inode = inode_ref (newinode);
+
+ ret = inode_path (oldfd->inode, NULL, (char **)&loc.path);
+ if (ret < 0) {
+ gf_log (fs->volname, GF_LOG_INFO, "inode_path failed");
+ goto out;
+ }
+
+ uuid_copy (loc.gfid, oldinode->gfid);
+
+
+ if (IA_ISDIR (oldinode->ia_type))
+ ret = syncop_opendir (newsubvol, &loc, newfd);
+ else
+ ret = syncop_open (newsubvol, &loc,
+ oldfd->flags & ~(O_TRUNC|O_EXCL|O_CREAT),
+ newfd);
+ loc_wipe (&loc);
+
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "syncop_open%s (%s) failed (%s) on graph %s (%d)",
+ IA_ISDIR (oldinode->ia_type) ? "dir" : "",
+ uuid_utoa_r (newinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ ret = glfs_migrate_fd_locks_safe (fs, oldsubvol, oldfd, newsubvol,
+ newfd);
+
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_WARNING,
+ "lock migration (%s) failed (%s) on graph %s (%d)",
+ uuid_utoa_r (newinode->gfid, uuid1),
+ strerror (errno),
+ graphid_str (newsubvol), newsubvol->graph->id);
+ goto out;
+ }
+
+ newfd->flags = oldfd->flags;
+ fd_bind (newfd);
+out:
+ if (newinode)
+ inode_unref (newinode);
+
+ if (ret) {
+ fd_unref (newfd);
+ newfd = NULL;
+ }
+
+ return newfd;
+}
+
+
+fd_t *
+__glfs_migrate_fd (struct glfs *fs, xlator_t *newsubvol, struct glfs_fd *glfd)
+{
+ fd_t *oldfd = NULL;
+ fd_t *newfd = NULL;
+
+ oldfd = glfd->fd;
+
+ fs->migration_in_progress = 1;
+ pthread_mutex_unlock (&fs->mutex);
+ {
+ newfd = glfs_migrate_fd_safe (fs, newsubvol, oldfd);
+ }
+ pthread_mutex_lock (&fs->mutex);
+ fs->migration_in_progress = 0;
+ pthread_cond_broadcast (&fs->cond);
+
+ return newfd;
+}
+
+
+fd_t *
+__glfs_resolve_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd)
+{
+ fd_t *fd = NULL;
+
+ if (glfd->fd->inode->table->xl == subvol)
+ return fd_ref (glfd->fd);
+
+ fd = __glfs_migrate_fd (fs, subvol, glfd);
+ if (!fd)
+ return NULL;
+
+ if (subvol == fs->active_subvol) {
+ fd_unref (glfd->fd);
+ glfd->fd = fd_ref (fd);
+ }
+
+ return fd;
+}
+
+
+fd_t *
+glfs_resolve_fd (struct glfs *fs, xlator_t *subvol, struct glfs_fd *glfd)
+{
+ fd_t *fd = NULL;
+
+ glfs_lock (fs);
+ {
+ fd = __glfs_resolve_fd (fs, subvol, glfd);
+ }
+ glfs_unlock (fs);
+
+ return fd;
+}
+
+
+void
+__glfs_migrate_openfds (struct glfs *fs, xlator_t *subvol)
+{
+ struct glfs_fd *glfd = NULL;
+ fd_t *fd = NULL;
+
+ list_for_each_entry (glfd, &fs->openfds, openfds) {
+ if (uuid_is_null (glfd->fd->inode->gfid)) {
+ gf_log (fs->volname, GF_LOG_INFO,
+ "skipping openfd %p/%p in graph %s (%d)",
+ glfd, glfd->fd, graphid_str(subvol),
+ subvol->graph->id);
+ /* create in progress, defer */
+ continue;
+ }
+
+ fd = __glfs_migrate_fd (fs, subvol, glfd);
+ if (fd) {
+ fd_unref (glfd->fd);
+ glfd->fd = fd;
+ }
+ }
+}
+
+
+xlator_t *
+__glfs_active_subvol (struct glfs *fs)
+{
+ xlator_t *new_subvol = NULL;
+ int ret = -1;
+ inode_t *new_cwd = NULL;
+
+ if (!fs->next_subvol)
+ return fs->active_subvol;
+
+ new_subvol = fs->next_subvol;
+
+ ret = __glfs_first_lookup (fs, new_subvol);
+ if (ret) {
+ gf_log (fs->volname, GF_LOG_INFO,
+ "first lookup on graph %s (%d) failed (%s)",
+ graphid_str (new_subvol), new_subvol->graph->id,
+ strerror (errno));
+ return NULL;
+ }
+
+ if (fs->cwd) {
+ new_cwd = __glfs_refresh_inode (fs, new_subvol, fs->cwd);
+
+ if (!new_cwd) {
+ char buf1[64];
+ gf_log (fs->volname, GF_LOG_INFO,
+ "cwd refresh of %s graph %s (%d) failed (%s)",
+ uuid_utoa_r (fs->cwd->gfid, buf1),
+ graphid_str (new_subvol),
+ new_subvol->graph->id, strerror (errno));
+ return NULL;
+ }
+ }
+
+ __glfs_migrate_openfds (fs, new_subvol);
+
+ /* switching @active_subvol and @cwd
+ should be atomic
+ */
+ fs->old_subvol = fs->active_subvol;
+ fs->active_subvol = fs->next_subvol;
+ fs->next_subvol = NULL;
+
+ if (new_cwd) {
+ __glfs_cwd_set (fs, new_cwd);
+ inode_unref (new_cwd);
+ }
+
+ gf_log (fs->volname, GF_LOG_INFO, "switched to graph %s (%d)",
+ graphid_str (new_subvol), new_subvol->graph->id);
+
+ return new_subvol;
+}
+
+xlator_t *
+glfs_active_subvol (struct glfs *fs)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *old_subvol = NULL;
+
+ glfs_lock (fs);
+ {
+ subvol = __glfs_active_subvol (fs);
+
+ if (subvol)
+ subvol->winds++;
+
+ if (fs->old_subvol) {
+ old_subvol = fs->old_subvol;
+ fs->old_subvol = NULL;
+ old_subvol->switched = 1;
+ }
+ }
+ glfs_unlock (fs);
+
+ if (old_subvol)
+ glfs_subvol_done (fs, old_subvol);
+
+ return subvol;
+}
+
+
+void
+glfs_subvol_done (struct glfs *fs, xlator_t *subvol)
+{
+ int ref = 0;
+ xlator_t *active_subvol = NULL;
+
+ if (!subvol)
+ return;
+
+ glfs_lock (fs);
+ {
+ ref = (--subvol->winds);
+ active_subvol = fs->active_subvol;
+ }
+ glfs_unlock (fs);
+
+ if (ref == 0) {
+ assert (subvol != active_subvol);
+ xlator_notify (subvol, GF_EVENT_PARENT_DOWN, subvol, NULL);
+ }
+}
+
+
+int
+__glfs_cwd_set (struct glfs *fs, inode_t *inode)
+{
+ if (inode->table->xl != fs->active_subvol) {
+ inode = __glfs_refresh_inode (fs, fs->active_subvol, inode);
+ if (!inode)
+ return -1;
+ } else {
+ inode_ref (inode);
+ }
+
+ if (fs->cwd)
+ inode_unref (fs->cwd);
+
+ fs->cwd = inode;
+
+ return 0;
+}
+
+
+int
+glfs_cwd_set (struct glfs *fs, inode_t *inode)
+{
+ int ret = 0;
+
+ glfs_lock (fs);
+ {
+ ret = __glfs_cwd_set (fs, inode);
+ }
+ glfs_unlock (fs);
+
+ return ret;
+}
+
+
+inode_t *
+__glfs_cwd_get (struct glfs *fs)
+{
+ inode_t *cwd = NULL;
+
+ if (!fs->cwd)
+ return NULL;
+
+ if (fs->cwd->table->xl == fs->active_subvol) {
+ cwd = inode_ref (fs->cwd);
+ return cwd;
+ }
+
+ cwd = __glfs_refresh_inode (fs, fs->active_subvol, fs->cwd);
+
+ return cwd;
+}
+
+inode_t *
+glfs_cwd_get (struct glfs *fs)
+{
+ inode_t *cwd = NULL;
+
+ glfs_lock (fs);
+ {
+ cwd = __glfs_cwd_get (fs);
+ }
+ glfs_unlock (fs);
+
+ return cwd;
+}
+
+inode_t *
+__glfs_resolve_inode (struct glfs *fs, xlator_t *subvol,
+ struct glfs_object *object)
+{
+ inode_t *inode = NULL;
+
+ if (object->inode->table->xl == subvol)
+ return inode_ref (object->inode);
+
+ inode = __glfs_refresh_inode (fs, fs->active_subvol,
+ object->inode);
+ if (!inode)
+ return NULL;
+
+ if (subvol == fs->active_subvol) {
+ inode_unref (object->inode);
+ object->inode = inode_ref (inode);
+ }
+
+ return inode;
+}
+
+inode_t *
+glfs_resolve_inode (struct glfs *fs, xlator_t *subvol,
+ struct glfs_object *object)
+{
+ inode_t *inode = NULL;
+
+ glfs_lock (fs);
+ {
+ inode = __glfs_resolve_inode(fs, subvol, object);
+ }
+ glfs_unlock (fs);
+
+ return inode;
+}
+
+int
+glfs_create_object (loc_t *loc, struct glfs_object **retobject)
+{
+ struct glfs_object *object = NULL;
+
+ object = GF_CALLOC (1, sizeof(struct glfs_object),
+ glfs_mt_glfs_object_t);
+ if (object == NULL) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ object->inode = loc->inode;
+ uuid_copy (object->gfid, object->inode->gfid);
+
+ /* we hold the reference */
+ loc->inode = NULL;
+
+ *retobject = object;
+
+ return 0;
+}
diff --git a/api/src/glfs.c b/api/src/glfs.c
new file mode 100644
index 000000000..29ed47c0c
--- /dev/null
+++ b/api/src/glfs.c
@@ -0,0 +1,673 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+/*
+ TODO:
+ - merge locks in glfs_posix_lock for lock self-healing
+ - set proper pid/lk_owner to call frames (currently buried in syncop)
+ - fix logging.c/h to store logfp and loglevel in glusterfs_ctx_t and
+ reach it via THIS.
+ - update syncop functions to accept/return xdata. ???
+ - protocol/client to reconnect immediately after portmap disconnect.
+ - handle SEEK_END failure in _lseek()
+ - handle umask (per filesystem?)
+ - make itables LRU based
+ - 0-copy for readv/writev
+ - reconcile the open/creat mess
+*/
+
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <limits.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "stack.h"
+#include "event.h"
+#include "glfs-mem-types.h"
+#include "common-utils.h"
+#include "syncop.h"
+#include "call-stub.h"
+
+#include "glfs.h"
+#include "glfs-internal.h"
+#include "hashfn.h"
+#include "rpc-clnt.h"
+
+
+static gf_boolean_t
+vol_assigned (cmd_args_t *args)
+{
+ return args->volfile || args->volfile_server;
+}
+
+
+static int
+glusterfs_ctx_defaults_init (glusterfs_ctx_t *ctx)
+{
+ call_pool_t *pool = NULL;
+ int ret = -1;
+
+ xlator_mem_acct_init (THIS, glfs_mt_end + 1);
+
+ ctx->process_uuid = generate_glusterfs_ctx_id ();
+ if (!ctx->process_uuid) {
+ goto err;
+ }
+
+ ctx->page_size = 128 * GF_UNIT_KB;
+
+ ctx->iobuf_pool = iobuf_pool_new ();
+ if (!ctx->iobuf_pool) {
+ goto err;
+ }
+
+ ctx->event_pool = event_pool_new (DEFAULT_EVENT_POOL_SIZE);
+ if (!ctx->event_pool) {
+ goto err;
+ }
+
+ ctx->env = syncenv_new (0, 0, 0);
+ if (!ctx->env) {
+ goto err;
+ }
+
+ pool = GF_CALLOC (1, sizeof (call_pool_t),
+ glfs_mt_call_pool_t);
+ if (!pool) {
+ goto err;
+ }
+
+ /* frame_mem_pool size 112 * 4k */
+ pool->frame_mem_pool = mem_pool_new (call_frame_t, 4096);
+ if (!pool->frame_mem_pool) {
+ goto err;
+ }
+ /* stack_mem_pool size 256 * 1024 */
+ pool->stack_mem_pool = mem_pool_new (call_stack_t, 1024);
+ if (!pool->stack_mem_pool) {
+ goto err;
+ }
+
+ ctx->stub_mem_pool = mem_pool_new (call_stub_t, 1024);
+ if (!ctx->stub_mem_pool) {
+ goto err;
+ }
+
+ ctx->dict_pool = mem_pool_new (dict_t, GF_MEMPOOL_COUNT_OF_DICT_T);
+ if (!ctx->dict_pool)
+ goto err;
+
+ ctx->dict_pair_pool = mem_pool_new (data_pair_t,
+ GF_MEMPOOL_COUNT_OF_DATA_PAIR_T);
+ if (!ctx->dict_pair_pool)
+ goto err;
+
+ ctx->dict_data_pool = mem_pool_new (data_t, GF_MEMPOOL_COUNT_OF_DATA_T);
+ if (!ctx->dict_data_pool)
+ goto err;
+
+ INIT_LIST_HEAD (&pool->all_frames);
+ INIT_LIST_HEAD (&ctx->cmd_args.xlator_options);
+ LOCK_INIT (&pool->lock);
+ ctx->pool = pool;
+
+ pthread_mutex_init (&(ctx->lock), NULL);
+
+ ret = 0;
+err:
+ if (ret && pool) {
+ if (pool->frame_mem_pool)
+ mem_pool_destroy (pool->frame_mem_pool);
+ if (pool->stack_mem_pool)
+ mem_pool_destroy (pool->stack_mem_pool);
+ GF_FREE (pool);
+ }
+
+ if (ret && ctx) {
+ if (ctx->stub_mem_pool)
+ mem_pool_destroy (ctx->stub_mem_pool);
+ if (ctx->dict_pool)
+ mem_pool_destroy (ctx->dict_pool);
+ if (ctx->dict_data_pool)
+ mem_pool_destroy (ctx->dict_data_pool);
+ if (ctx->dict_pair_pool)
+ mem_pool_destroy (ctx->dict_pair_pool);
+ }
+
+ return ret;
+}
+
+
+static int
+create_master (struct glfs *fs)
+{
+ int ret = 0;
+ xlator_t *master = NULL;
+
+ master = GF_CALLOC (1, sizeof (*master),
+ glfs_mt_xlator_t);
+ if (!master)
+ goto err;
+
+ master->name = gf_strdup ("gfapi");
+ if (!master->name)
+ goto err;
+
+ if (xlator_set_type (master, "mount/api") == -1) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "master xlator for %s initialization failed",
+ fs->volname);
+ goto err;
+ }
+
+ master->ctx = fs->ctx;
+ master->private = fs;
+ master->options = get_new_dict ();
+ if (!master->options)
+ goto err;
+
+
+ ret = xlator_init (master);
+ if (ret) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "failed to initialize gfapi translator");
+ goto err;
+ }
+
+ fs->ctx->master = master;
+ THIS = master;
+
+ return 0;
+
+err:
+ if (master) {
+ xlator_destroy (master);
+ }
+
+ return -1;
+}
+
+
+static FILE *
+get_volfp (struct glfs *fs)
+{
+ int ret = 0;
+ cmd_args_t *cmd_args = NULL;
+ FILE *specfp = NULL;
+ struct stat statbuf;
+
+ cmd_args = &fs->ctx->cmd_args;
+
+ ret = lstat (cmd_args->volfile, &statbuf);
+ if (ret == -1) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "%s: %s", cmd_args->volfile, strerror (errno));
+ return NULL;
+ }
+
+ if ((specfp = fopen (cmd_args->volfile, "r")) == NULL) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "volume file %s: %s",
+ cmd_args->volfile,
+ strerror (errno));
+ return NULL;
+ }
+
+ gf_log ("glfs", GF_LOG_DEBUG,
+ "loading volume file %s", cmd_args->volfile);
+
+ return specfp;
+}
+
+
+int
+glfs_volumes_init (struct glfs *fs)
+{
+ FILE *fp = NULL;
+ cmd_args_t *cmd_args = NULL;
+ int ret = 0;
+
+ cmd_args = &fs->ctx->cmd_args;
+
+ if (!vol_assigned (cmd_args))
+ return -1;
+
+ if (cmd_args->volfile_server) {
+ ret = glfs_mgmt_init (fs);
+ goto out;
+ }
+
+ fp = get_volfp (fs);
+
+ if (!fp) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "Cannot reach volume specification file");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glfs_process_volfp (fs, fp);
+ if (ret)
+ goto out;
+
+out:
+ return ret;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+
+
+int
+glfs_set_xlator_option (struct glfs *fs, const char *xlator, const char *key,
+ const char *value)
+{
+ xlator_cmdline_option_t *option = NULL;
+
+ option = GF_CALLOC (1, sizeof (*option),
+ glfs_mt_xlator_cmdline_option_t);
+ if (!option)
+ goto enomem;
+
+ INIT_LIST_HEAD (&option->cmd_args);
+
+ option->volume = gf_strdup (xlator);
+ if (!option->volume)
+ goto enomem;
+ option->key = gf_strdup (key);
+ if (!option->key)
+ goto enomem;
+ option->value = gf_strdup (value);
+ if (!option->value)
+ goto enomem;
+
+ list_add (&option->cmd_args, &fs->ctx->cmd_args.xlator_options);
+
+ return 0;
+enomem:
+ errno = ENOMEM;
+
+ if (!option)
+ return -1;
+
+ GF_FREE (option->volume);
+ GF_FREE (option->key);
+ GF_FREE (option->value);
+ GF_FREE (option);
+
+ return -1;
+}
+
+int glfs_setfsuid (uid_t fsuid)
+{
+ return syncopctx_setfsuid (&fsuid);
+}
+
+int glfs_setfsgid (gid_t fsgid)
+{
+ return syncopctx_setfsgid (&fsgid);
+}
+
+int glfs_setfsgroups (size_t size, const gid_t *list)
+{
+ return syncopctx_setfsgroups(size, list);
+}
+
+struct glfs *
+glfs_from_glfd (struct glfs_fd *glfd)
+{
+ return glfd->fs;
+}
+
+
+struct glfs_fd *
+glfs_fd_new (struct glfs *fs)
+{
+ struct glfs_fd *glfd = NULL;
+
+ glfd = GF_CALLOC (1, sizeof (*glfd), glfs_mt_glfs_fd_t);
+ if (!glfd)
+ return NULL;
+
+ glfd->fs = fs;
+
+ INIT_LIST_HEAD (&glfd->openfds);
+
+ return glfd;
+}
+
+
+void
+glfs_fd_bind (struct glfs_fd *glfd)
+{
+ struct glfs *fs = NULL;
+
+ fs = glfd->fs;
+
+ glfs_lock (fs);
+ {
+ list_add_tail (&glfd->openfds, &fs->openfds);
+ }
+ glfs_unlock (fs);
+}
+
+void
+glfs_fd_destroy (struct glfs_fd *glfd)
+{
+ if (!glfd)
+ return;
+
+ glfs_lock (glfd->fs);
+ {
+ list_del_init (&glfd->openfds);
+ }
+ glfs_unlock (glfd->fs);
+
+ if (glfd->fd)
+ fd_unref (glfd->fd);
+
+ GF_FREE (glfd->readdirbuf);
+
+ GF_FREE (glfd);
+}
+
+
+static void *
+glfs_poller (void *data)
+{
+ struct glfs *fs = NULL;
+
+ fs = data;
+
+ event_dispatch (fs->ctx->event_pool);
+
+ return NULL;
+}
+
+
+struct glfs *
+glfs_new (const char *volname)
+{
+ struct glfs *fs = NULL;
+ int ret = -1;
+ glusterfs_ctx_t *ctx = NULL;
+
+ ctx = glusterfs_ctx_new ();
+ if (!ctx) {
+ return NULL;
+ }
+
+#ifdef DEBUG
+ gf_mem_acct_enable_set (ctx);
+#endif
+
+ /* first globals init, for gf_mem_acct_enable_set () */
+ ret = glusterfs_globals_init (ctx);
+ if (ret)
+ return NULL;
+
+ THIS->ctx = ctx;
+
+ /* then ctx_defaults_init, for xlator_mem_acct_init(THIS) */
+ ret = glusterfs_ctx_defaults_init (ctx);
+ if (ret)
+ return NULL;
+
+ fs = GF_CALLOC (1, sizeof (*fs), glfs_mt_glfs_t);
+ if (!fs)
+ return NULL;
+ fs->ctx = ctx;
+
+ glfs_set_logging (fs, "/dev/null", 0);
+
+ fs->ctx->cmd_args.volfile_id = gf_strdup (volname);
+
+ fs->volname = gf_strdup (volname);
+
+ pthread_mutex_init (&fs->mutex, NULL);
+ pthread_cond_init (&fs->cond, NULL);
+
+ INIT_LIST_HEAD (&fs->openfds);
+
+ return fs;
+}
+
+
+int
+glfs_set_volfile (struct glfs *fs, const char *volfile)
+{
+ cmd_args_t *cmd_args = NULL;
+
+ cmd_args = &fs->ctx->cmd_args;
+
+ if (vol_assigned (cmd_args))
+ return -1;
+
+ cmd_args->volfile = gf_strdup (volfile);
+
+ return 0;
+}
+
+
+int
+glfs_set_volfile_server (struct glfs *fs, const char *transport,
+ const char *host, int port)
+{
+ cmd_args_t *cmd_args = NULL;
+
+ cmd_args = &fs->ctx->cmd_args;
+
+ if (vol_assigned (cmd_args))
+ return -1;
+
+ cmd_args->volfile_server = gf_strdup (host);
+ cmd_args->volfile_server_transport = gf_strdup (transport);
+ cmd_args->volfile_server_port = port;
+ cmd_args->max_connect_attempts = 2;
+
+ return 0;
+}
+
+
+int
+glfs_set_logging (struct glfs *fs, const char *logfile, int loglevel)
+{
+ int ret = 0;
+ char *tmplog = NULL;
+
+ if (!logfile) {
+ ret = gf_set_log_file_path (&fs->ctx->cmd_args);
+ if (ret)
+ goto out;
+ tmplog = fs->ctx->cmd_args.log_file;
+ } else {
+ tmplog = (char *)logfile;
+ }
+
+ ret = gf_log_init (fs->ctx, tmplog, NULL);
+ if (ret)
+ goto out;
+
+ if (loglevel >= 0)
+ gf_log_set_loglevel (loglevel);
+
+out:
+ return ret;
+}
+
+
+int
+glfs_init_wait (struct glfs *fs)
+{
+ int ret = -1;
+
+ /* Always a top-down call, use glfs_lock() */
+ glfs_lock (fs);
+ {
+ while (!fs->init)
+ pthread_cond_wait (&fs->cond,
+ &fs->mutex);
+ ret = fs->ret;
+ errno = fs->err;
+ }
+ glfs_unlock (fs);
+
+ return ret;
+}
+
+
+void
+glfs_init_done (struct glfs *fs, int ret)
+{
+ glfs_init_cbk init_cbk;
+
+ if (!fs) {
+ gf_log ("glfs", GF_LOG_ERROR,
+ "fs is NULL");
+ goto out;
+ }
+
+ init_cbk = fs->init_cbk;
+
+ /* Always a bottom-up call, use mutex_lock() */
+ pthread_mutex_lock (&fs->mutex);
+ {
+ fs->init = 1;
+ fs->ret = ret;
+ fs->err = errno;
+
+ if (!init_cbk)
+ pthread_cond_broadcast (&fs->cond);
+ }
+ pthread_mutex_unlock (&fs->mutex);
+
+ if (init_cbk)
+ init_cbk (fs, ret);
+out:
+ return;
+}
+
+
+int
+glfs_init_common (struct glfs *fs)
+{
+ int ret = -1;
+
+ ret = create_master (fs);
+ if (ret)
+ return ret;
+
+ ret = gf_thread_create (&fs->poller, NULL, glfs_poller, fs);
+ if (ret)
+ return ret;
+
+ ret = glfs_volumes_init (fs);
+ if (ret)
+ return ret;
+
+ fs->dev_id = gf_dm_hashfn (fs->volname, strlen (fs->volname));
+ return ret;
+}
+
+
+int
+glfs_init_async (struct glfs *fs, glfs_init_cbk cbk)
+{
+ int ret = -1;
+
+ fs->init_cbk = cbk;
+
+ ret = glfs_init_common (fs);
+
+ return ret;
+}
+
+
+int
+glfs_init (struct glfs *fs)
+{
+ int ret = -1;
+
+ ret = glfs_init_common (fs);
+ if (ret)
+ return ret;
+
+ ret = glfs_init_wait (fs);
+
+ return ret;
+}
+
+
+int
+glfs_fini (struct glfs *fs)
+{
+ int ret = -1;
+ int countdown = 100;
+ xlator_t *subvol = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ call_pool_t *call_pool = NULL;
+
+ ctx = fs->ctx;
+
+ if (ctx->mgmt) {
+ rpc_clnt_disable (ctx->mgmt);
+ ctx->mgmt = NULL;
+ }
+
+ __glfs_entry_fs (fs);
+
+ call_pool = fs->ctx->pool;
+
+ while (countdown--) {
+ /* give some time for background frames to finish */
+ if (!call_pool->cnt)
+ break;
+ usleep (100000);
+ }
+ /* leaked frames may exist, we ignore */
+
+ /* We deem glfs_fini as successful if there are no pending frames in
+ the call pool */
+ ret = (call_pool->cnt == 0)? 0: -1;
+
+ subvol = glfs_active_subvol (fs);
+ if (subvol) {
+ /* PARENT_DOWN within glfs_subvol_done() is issued only
+ on graph switch (the new graph should activate and
+ decrement the extra @winds count taken in glfs_graph_setup()).
+
+ Since we are explicitly destroying, PARENT_DOWN is necessary
+ */
+ xlator_notify (subvol, GF_EVENT_PARENT_DOWN, subvol, 0);
+ /* TBD: wait for CHILD_DOWN before exiting, in case of
+ asynchronous cleanup like graceful socket disconnection
+ in the future.
+ */
+ }
+
+ glfs_subvol_done (fs, subvol);
+
+ if (ctx->log.logfile)
+ fclose (ctx->log.logfile);
+
+ return ret;
+}
diff --git a/api/src/glfs.h b/api/src/glfs.h
new file mode 100644
index 000000000..18fda496e
--- /dev/null
+++ b/api/src/glfs.h
@@ -0,0 +1,581 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _GLFS_H
+#define _GLFS_H
+
+/*
+ Enforce the following flags as libgfapi is built
+ with them, and we want programs linking against them to also
+ be built with these flags. This is necessary as it affects
+ some of the structures defined in libc headers (like struct stat)
+ and those definitions need to be consistently compiled in
+ both the library and the application.
+*/
+
+#ifndef _FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 64
+#endif
+
+#ifndef __USE_FILE_OFFSET64
+#define __USE_FILE_OFFSET64
+#endif
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <sys/cdefs.h>
+#include <dirent.h>
+#include <sys/statvfs.h>
+
+__BEGIN_DECLS
+
+/* The filesystem object. One object per 'virtual mount' */
+struct glfs;
+typedef struct glfs glfs_t;
+
+
+/*
+ SYNOPSIS
+
+ glfs_new: Create a new 'virtual mount' object.
+
+ DESCRIPTION
+
+ This is most likely the very first function you will use. This function
+ will create a new glfs_t (virtual mount) object in memory.
+
+ On this newly created glfs_t, you need to either set a volfile path
+ (glfs_set_volfile) or a volfile server (glfs_set_volfile_server).
+
+ The glfs_t object needs to be initialized with glfs_init() before you
+ can start issuing file operations on it.
+
+ PARAMETERS
+
+ @volname: Name of the volume. This identifies the server-side volume and
+ the fetched volfile (equivalent of --volfile-id command line
+ parameter to glusterfsd). When used with glfs_set_volfile() the
+ @volname has no effect (except for appearing in log messages).
+
+ RETURN VALUES
+
+ NULL : Out of memory condition.
+ Others : Pointer to the newly created glfs_t virtual mount object.
+
+*/
+
+glfs_t *glfs_new (const char *volname);
+
+
+/*
+ SYNOPSIS
+
+ glfs_set_volfile: Specify the path to the volume specification file.
+
+ DESCRIPTION
+
+ If you are using a static volume specification file (without dynamic
+ volume management abilities from the CLI), then specify the path to
+ the volume specification file.
+
+ This is incompatible with glfs_set_volfile_server().
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be configured with the volume
+ specification file.
+
+ @volfile: Path to the locally available volume specification file.
+
+ RETURN VALUES
+
+ 0 : Success.
+ -1 : Failure. @errno will be set with the type of failure.
+
+*/
+
+int glfs_set_volfile (glfs_t *fs, const char *volfile);
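
As a minimal sketch of the static-volfile mode described above (the volume name and file path are hypothetical), note that the call fails if a volfile or volfile server has already been assigned:

        glfs_t *fs = glfs_new ("testvol");
        if (glfs_set_volfile (fs, "/etc/glusterfs/testvol.vol") != 0) {
                /* a volfile or volfile server was already assigned */
        }
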
+
+
+/*
+ SYNOPSIS
+
+ glfs_set_volfile_server: Specify the address of management server.
+
+ DESCRIPTION
+
+ This function specifies the address of the management server (glusterd)
+ to connect, and establish the volume configuration. The @volname
+ parameter passed to glfs_new() is the volume which will be virtually
+ mounted as the glfs_t object. All operations performed by the CLI at
+ the management server will automatically be reflected in the 'virtual
+ mount' object as it maintains a connection to glusterd and polls on
+ configuration change notifications.
+
+ This is incompatible with glfs_set_volfile().
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be configured with the volume
+ specification file.
+
+ @transport: String specifying the transport used to connect to the
+ management daemon. Specifying NULL will result in the usage
+ of the default (tcp) transport type. Permitted values
+ are those you would specify as transport-type in a volume
+ specification file (e.g "tcp", "rdma", "unix".)
+
+ @host: String specifying the address of where to find the management
+ daemon. Depending on the transport type this would either be
+ an FQDN (e.g: "storage01.company.com"), ASCII encoded IP
+ address "192.168.22.1", or a UNIX domain socket path (e.g
+ "/tmp/glusterd.socket".)
+
+ @port: The TCP port number where gluster management daemon is listening.
+ Specifying 0 uses the default port number GF_DEFAULT_BASE_PORT.
+ This parameter is unused if you are using a UNIX domain socket.
+
+ RETURN VALUES
+
+ 0 : Success.
+ -1 : Failure. @errno will be set with the type of failure.
+
+*/
+
+int glfs_set_volfile_server (glfs_t *fs, const char *transport,
+ const char *host, int port);
+
+
+/*
+ SYNOPSIS
+
+ glfs_set_logging: Specify logging parameters.
+
+ DESCRIPTION
+
+ This function specifies logging parameters for the virtual mount.
+ Default log file is /dev/null.
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be configured with the logging parameters.
+
+ @logfile: The logfile to be used for logging. Will be created if it does not
+ already exist (provided system permissions allow). If NULL, a new
+ logfile will be created in the default log directory associated with
+ the glusterfs installation.
+
+ @loglevel: Numerical value specifying the degree of verbosity. The higher
+ the value, the more verbose the logging.
+
+ RETURN VALUES
+
+ 0 : Success.
+ -1 : Failure. @errno will be set with the type of failure.
+
+*/
+
+int glfs_set_logging (glfs_t *fs, const char *logfile, int loglevel);
+
+
+/*
+ SYNOPSIS
+
+ glfs_init: Initialize the 'virtual mount'
+
+ DESCRIPTION
+
+ This function initializes the glfs_t object. This consists of many steps:
+ - Spawn a poll-loop thread.
+ - Establish connection to management daemon and receive volume specification.
+ - Construct translator graph and initialize graph.
+ - Wait for initialization (connecting to all bricks) to complete.
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be initialized.
+
+ RETURN VALUES
+
+ 0 : Success.
+ -1 : Failure. @errno will be set with the type of failure.
+
+*/
+
+int glfs_init (glfs_t *fs);
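
As a minimal end-to-end initialization sketch assembled from the calls documented above (the volume name, server address, port and log path are hypothetical; a port of 0 selects the default):

        glfs_t *fs = glfs_new ("testvol");
        if (!fs)
                return -1;

        glfs_set_volfile_server (fs, "tcp", "server1", 0);
        glfs_set_logging (fs, "/tmp/testvol-api.log", 7);  /* higher = more verbose */

        if (glfs_init (fs) != 0)
                return -1;   /* errno holds the reason */

        /* the virtual mount is now ready for file operations */
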
+
+
+/*
+ SYNOPSIS
+
+ glfs_fini: Cleanup and destroy the 'virtual mount'
+
+ DESCRIPTION
+
+ This function attempts to gracefully destroy the glfs_t object. An attempt
+ is made to wait for all background processing to complete before returning.
+
+ glfs_fini() must be called after all operations on the glfs_t are finished.
+
+ IMPORTANT
+
+ IT IS NECESSARY TO CALL glfs_fini() ON ALL THE INITIALIZED glfs_t
+ OBJECTS BEFORE TERMINATING THE PROGRAM. THERE MAY BE CACHED AND
+ UNWRITTEN / INCOMPLETE OPERATIONS STILL IN PROGRESS EVEN THOUGH THE
+ API CALLS HAVE RETURNED. glfs_fini() WILL WAIT FOR BACKGROUND OPERATIONS
+ TO COMPLETE BEFORE RETURNING, THEREBY MAKING IT SAFE FOR THE PROGRAM TO
+ EXIT.
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be destroyed.
+
+ RETURN VALUES
+
+ 0 : Success.
+*/
+
+int glfs_fini (glfs_t *fs);
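
As stressed above, every successfully initialized handle should be torn down once all operations on it have completed, for example:

        /* ... all I/O on "fs" has finished ... */
        glfs_fini (fs);
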
+
+/*
+ * FILE OPERATION
+ *
+ * What follows are filesystem operations performed on the
+ * 'virtual mount'. The calls here are kept as close to
+ * the POSIX system calls as possible.
+ *
+ * Notes:
+ *
+ * - All paths specified, even if absolute, are relative to the
+ * root of the virtual mount and not the system root (/).
+ *
+ */
+
+/* The file descriptor object. One per open file/directory. */
+
+struct glfs_fd;
+typedef struct glfs_fd glfs_fd_t;
+
+/*
+ * PER THREAD IDENTITY MODIFIERS
+ *
+ * The following operations enable setting a per-thread identity context
+ * for the glfs APIs to perform operations as. The calls here are kept as close
+ * to POSIX equivalents as possible.
+ *
+ * NOTES:
+ *
+ * - setgroups is a per-thread setting, hence this is named fsgroups to stay
+ * close in naming to the fs(u/g)id APIs
+ * - The typical mode of operation is to set the IDs as required (optionally
+ * setting the supplementary groups as well), make the glfs call, and after
+ * the glfs operation set them back to the euid/egid or uid/gid appropriate
+ * to the caller
+ * - Once set, the groups need to be unset by setting the size to 0 (in which
+ * case the list argument is ignored)
+ * - Once a thread chooses to set the IDs, all glfs calls made from that
+ * thread default to the IDs set for that thread. Use these APIs with care
+ * and ensure that the set IDs are reverted to the global process defaults
+ * as required.
+ *
+ */
+int glfs_setfsuid (uid_t fsuid);
+int glfs_setfsgid (gid_t fsgid);
+int glfs_setfsgroups (size_t size, const gid_t *list);
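
A hedged sketch of the set/operate/revert pattern described in the notes above (the uid/gid values are illustrative only):

        glfs_setfsuid (1000);
        glfs_setfsgid (1000);
        /* ... glfs calls made from this thread now run as uid/gid 1000 ... */
        glfs_setfsuid (0);
        glfs_setfsgid (0);
        glfs_setfsgroups (0, NULL);   /* size 0 clears any supplementary groups */
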
+
+/*
+ SYNOPSIS
+
+ glfs_open: Open a file.
+
+ DESCRIPTION
+
+ This function opens a file on a virtual mount.
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be initialized.
+
+ @path: Path of the file within the virtual mount.
+
+ @flags: Open flags. See open(2). O_CREAT is not supported.
+ Use glfs_creat() for creating files.
+
+ RETURN VALUES
+
+ NULL : Failure. @errno will be set with the type of failure.
+ Others : Pointer to the opened glfs_fd_t.
+
+ */
+
+glfs_fd_t *glfs_open (glfs_t *fs, const char *path, int flags);
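
A brief sketch of opening and reading a file on the virtual mount (the path is hypothetical; glfs_read() and glfs_close() are declared further below):

        glfs_fd_t *fd = glfs_open (fs, "/dir/file.txt", O_RDONLY);
        if (!fd)
                return -1;   /* errno is set */

        char buf[4096];
        ssize_t nread = glfs_read (fd, buf, sizeof (buf), 0);

        glfs_close (fd);
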
+
+
+/*
+ SYNOPSIS
+
+ glfs_creat: Create a file.
+
+ DESCRIPTION
+
+ This function creates and opens a file on a virtual mount.
+
+ PARAMETERS
+
+ @fs: The 'virtual mount' object to be initialized.
+
+ @path: Path of the file within the virtual mount.
+
+ @mode: Permission of the file to be created.
+
+ @flags: Create flags. See open(2). O_EXCL is supported.
+
+ RETURN VALUES
+
+ NULL : Failure. @errno will be set with the type of failure.
+ Others : Pointer to the opened glfs_fd_t.
+
+ */
+
+glfs_fd_t *glfs_creat (glfs_t *fs, const char *path, int flags,
+ mode_t mode);
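
A short sketch of creating a file with glfs_creat() (the path and mode are illustrative):

        glfs_fd_t *fd = glfs_creat (fs, "/dir/newfile", O_WRONLY | O_EXCL, 0644);
        if (!fd)
                return -1;   /* errno is set, e.g. EEXIST with O_EXCL */
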
+
+int glfs_close (glfs_fd_t *fd);
+
+glfs_t *glfs_from_glfd (glfs_fd_t *fd);
+
+int glfs_set_xlator_option (glfs_t *fs, const char *xlator, const char *key,
+ const char *value);
+
+/*
+
+ glfs_io_cbk
+
+ The following is the function type definition of the callback
+ function pointer which has to be provided by the caller to the
+ *_async() versions of the IO calls.
+
+ The callback function is called on completion of the requested
+ IO, and the appropriate return value is returned in @ret.
+
+ In case of an error in completing the IO, @ret will be -1 and
+ @errno will be set with the appropriate error.
+
+ @ret will be same as the return value of the non _async() variant
+ of the particular call
+
+ @data is the same context pointer provided by the caller at the
+ time of issuing the async IO call. This can be used by the
+ caller to differentiate different instances of the async requests
+ in a common callback function.
+*/
+
+typedef void (*glfs_io_cbk) (glfs_fd_t *fd, ssize_t ret, void *data);
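
An illustrative sketch of the callback signature above as it might be used with one of the *_async() calls declared below (the callback and buffer names are hypothetical):

        static void
        read_done (glfs_fd_t *fd, ssize_t ret, void *data)
        {
                if (ret < 0) {
                        /* errno describes the failure */
                }
                /* "data" is the context pointer passed at submission time */
        }

        /* submission: glfs_read_async (fd, buf, sizeof (buf), 0, read_done, buf); */
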
+
+// glfs_{read,write}[_async]
+
+ssize_t glfs_read (glfs_fd_t *fd, void *buf, size_t count, int flags);
+ssize_t glfs_write (glfs_fd_t *fd, const void *buf, size_t count, int flags);
+int glfs_read_async (glfs_fd_t *fd, void *buf, size_t count, int flags,
+ glfs_io_cbk fn, void *data);
+int glfs_write_async (glfs_fd_t *fd, const void *buf, size_t count, int flags,
+ glfs_io_cbk fn, void *data);
+
+// glfs_{read,write}v[_async]
+
+ssize_t glfs_readv (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
+ int flags);
+ssize_t glfs_writev (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
+ int flags);
+int glfs_readv_async (glfs_fd_t *fd, const struct iovec *iov, int count,
+ int flags, glfs_io_cbk fn, void *data);
+int glfs_writev_async (glfs_fd_t *fd, const struct iovec *iov, int count,
+ int flags, glfs_io_cbk fn, void *data);
+
+// glfs_p{read,write}[_async]
+
+ssize_t glfs_pread (glfs_fd_t *fd, void *buf, size_t count, off_t offset,
+ int flags);
+ssize_t glfs_pwrite (glfs_fd_t *fd, const void *buf, size_t count,
+ off_t offset, int flags);
+int glfs_pread_async (glfs_fd_t *fd, void *buf, size_t count, off_t offset,
+ int flags, glfs_io_cbk fn, void *data);
+int glfs_pwrite_async (glfs_fd_t *fd, const void *buf, int count, off_t offset,
+ int flags, glfs_io_cbk fn, void *data);
+
+// glfs_p{read,write}v[_async]
+
+ssize_t glfs_preadv (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
+ off_t offset, int flags);
+ssize_t glfs_pwritev (glfs_fd_t *fd, const struct iovec *iov, int iovcnt,
+ off_t offset, int flags);
+int glfs_preadv_async (glfs_fd_t *fd, const struct iovec *iov, int count,
+ off_t offset, int flags, glfs_io_cbk fn, void *data);
+int glfs_pwritev_async (glfs_fd_t *fd, const struct iovec *iov, int count,
+ off_t offset, int flags, glfs_io_cbk fn, void *data);
+
+
+off_t glfs_lseek (glfs_fd_t *fd, off_t offset, int whence);
+
+int glfs_truncate (glfs_t *fs, const char *path, off_t length);
+
+int glfs_ftruncate (glfs_fd_t *fd, off_t length);
+int glfs_ftruncate_async (glfs_fd_t *fd, off_t length, glfs_io_cbk fn,
+ void *data);
+
+int glfs_lstat (glfs_t *fs, const char *path, struct stat *buf);
+int glfs_stat (glfs_t *fs, const char *path, struct stat *buf);
+int glfs_fstat (glfs_fd_t *fd, struct stat *buf);
+
+int glfs_fsync (glfs_fd_t *fd);
+int glfs_fsync_async (glfs_fd_t *fd, glfs_io_cbk fn, void *data);
+
+int glfs_fdatasync (glfs_fd_t *fd);
+int glfs_fdatasync_async (glfs_fd_t *fd, glfs_io_cbk fn, void *data);
+
+int glfs_access (glfs_t *fs, const char *path, int mode);
+
+int glfs_symlink (glfs_t *fs, const char *oldpath, const char *newpath);
+
+int glfs_readlink (glfs_t *fs, const char *path, char *buf, size_t bufsiz);
+
+int glfs_mknod (glfs_t *fs, const char *path, mode_t mode, dev_t dev);
+
+int glfs_mkdir (glfs_t *fs, const char *path, mode_t mode);
+
+int glfs_unlink (glfs_t *fs, const char *path);
+
+int glfs_rmdir (glfs_t *fs, const char *path);
+
+int glfs_rename (glfs_t *fs, const char *oldpath, const char *newpath);
+
+int glfs_link (glfs_t *fs, const char *oldpath, const char *newpath);
+
+glfs_fd_t *glfs_opendir (glfs_t *fs, const char *path);
+
+/*
+ * @glfs_readdir_r and @glfs_readdirplus_r ARE thread safe AND re-entrant,
+ * but the interface has ambiguity about the size of @dirent to be allocated
+ * before calling the APIs. A 512-byte buffer (for @dirent) is sufficient for
+ * all known systems tested against glusterfs/gfapi, but may be
+ * insufficient in the future.
+ */
+
+int glfs_readdir_r (glfs_fd_t *fd, struct dirent *dirent,
+ struct dirent **result);
+
+int glfs_readdirplus_r (glfs_fd_t *fd, struct stat *stat, struct dirent *dirent,
+ struct dirent **result);
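
A hedged sketch of the re-entrant readdir variant using the 512-byte buffer suggested in the note above (the directory path is illustrative):

        union {
                struct dirent d;
                char pad[512];   /* per the note above, 512 bytes suffices */
        } entry;
        struct dirent *result = NULL;

        glfs_fd_t *dfd = glfs_opendir (fs, "/dir");
        while (glfs_readdir_r (dfd, &entry.d, &result) == 0 && result != NULL) {
                /* result->d_name holds the entry name */
        }
        glfs_closedir (dfd);
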
+
+/*
+ * @glfs_readdir and @glfs_readdirplus are NEITHER thread safe NOR re-entrant
+ * when called on the same directory handle. However they ARE thread safe
+ * AND re-entrant when called on different directory handles (which may be
+ * referring to the same directory too.)
+ */
+
+struct dirent *glfs_readdir (glfs_fd_t *fd);
+
+struct dirent *glfs_readdirplus (glfs_fd_t *fd, struct stat *stat);
+
+long glfs_telldir (glfs_fd_t *fd);
+
+void glfs_seekdir (glfs_fd_t *fd, long offset);
+
+int glfs_closedir (glfs_fd_t *fd);
+
+int glfs_statvfs (glfs_t *fs, const char *path, struct statvfs *buf);
+
+int glfs_chmod (glfs_t *fs, const char *path, mode_t mode);
+
+int glfs_fchmod (glfs_fd_t *fd, mode_t mode);
+
+int glfs_chown (glfs_t *fs, const char *path, uid_t uid, gid_t gid);
+
+int glfs_lchown (glfs_t *fs, const char *path, uid_t uid, gid_t gid);
+
+int glfs_fchown (glfs_fd_t *fd, uid_t uid, gid_t gid);
+
+int glfs_utimens (glfs_t *fs, const char *path, struct timespec times[2]);
+
+int glfs_lutimens (glfs_t *fs, const char *path, struct timespec times[2]);
+
+int glfs_futimens (glfs_fd_t *fd, struct timespec times[2]);
+
+ssize_t glfs_getxattr (glfs_t *fs, const char *path, const char *name,
+ void *value, size_t size);
+
+ssize_t glfs_lgetxattr (glfs_t *fs, const char *path, const char *name,
+ void *value, size_t size);
+
+ssize_t glfs_fgetxattr (glfs_fd_t *fd, const char *name,
+ void *value, size_t size);
+
+ssize_t glfs_listxattr (glfs_t *fs, const char *path, void *value, size_t size);
+
+ssize_t glfs_llistxattr (glfs_t *fs, const char *path, void *value,
+ size_t size);
+
+ssize_t glfs_flistxattr (glfs_fd_t *fd, void *value, size_t size);
+
+int glfs_setxattr (glfs_t *fs, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+int glfs_lsetxattr (glfs_t *fs, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+int glfs_fsetxattr (glfs_fd_t *fd, const char *name,
+ const void *value, size_t size, int flags);
+
+int glfs_removexattr (glfs_t *fs, const char *path, const char *name);
+
+int glfs_lremovexattr (glfs_t *fs, const char *path, const char *name);
+
+int glfs_fremovexattr (glfs_fd_t *fd, const char *name);
+
+int glfs_fallocate(glfs_fd_t *fd, int keep_size, off_t offset, size_t len);
+
+int glfs_discard(glfs_fd_t *fd, off_t offset, size_t len);
+
+
+int glfs_discard_async (glfs_fd_t *fd, off_t length, size_t len,
+ glfs_io_cbk fn, void *data);
+
+int glfs_zerofill(glfs_fd_t *fd, off_t offset, size_t len);
+
+int glfs_zerofill_async (glfs_fd_t *fd, off_t length, size_t len,
+ glfs_io_cbk fn, void *data);
+
+char *glfs_getcwd (glfs_t *fs, char *buf, size_t size);
+
+int glfs_chdir (glfs_t *fs, const char *path);
+
+int glfs_fchdir (glfs_fd_t *fd);
+
+char *glfs_realpath (glfs_t *fs, const char *path, char *resolved_path);
+
+/*
+ * @cmd and @flock are as specified in man fcntl(2).
+ */
+int glfs_posix_lock (glfs_fd_t *fd, int cmd, struct flock *flock);
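
A brief sketch of advisory locking through glfs_posix_lock(), mirroring fcntl(2) (the byte range and lock type are illustrative):

        struct flock lock = {0, };
        lock.l_type   = F_WRLCK;
        lock.l_whence = SEEK_SET;
        lock.l_start  = 0;
        lock.l_len    = 0;   /* 0 = lock the whole file */

        if (glfs_posix_lock (fd, F_SETLK, &lock) != 0) {
                /* errno is set, e.g. EAGAIN if the lock is held elsewhere */
        }
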
+
+glfs_fd_t *glfs_dup (glfs_fd_t *fd);
+
+__END_DECLS
+
+#endif /* !_GLFS_H */
diff --git a/argp-standalone/configure.ac b/argp-standalone/configure.ac
index 65ebc4518..2ecd2a801 100644
--- a/argp-standalone/configure.ac
+++ b/argp-standalone/configure.ac
@@ -8,7 +8,7 @@ AC_CONFIG_SRCDIR([argp-ba.c])
AC_CONFIG_AUX_DIR([.])
AM_INIT_AUTOMAKE
-AM_CONFIG_HEADER(config.h)
+AC_CONFIG_HEADERS(config.h)
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES(yes)])
@@ -22,7 +22,7 @@ AC_GNU_SOURCE
AC_PROG_CC
AC_PROG_MAKE_SET
AC_PROG_RANLIB
-AM_PROG_CC_STDC
+AC_PROG_CC
if test "x$am_cv_prog_cc_stdc" = xno ; then
AC_ERROR([the C compiler doesn't handle ANSI-C])
diff --git a/autogen.sh b/autogen.sh
index e20408bf2..f937e6be0 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -1,8 +1,105 @@
#!/bin/sh
-aclocal
-autoheader
-(libtoolize --automake --copy --force || glibtoolize --automake --copy --force)
-autoconf
-automake --add-missing --copy --foreign
+echo
+echo ... GlusterFS autogen ...
+echo
+
+## Check all dependencies are present
+MISSING=""
+
+# Check for aclocal
+env aclocal --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ ACLOCAL=aclocal
+else
+ MISSING="$MISSING aclocal"
+fi
+
+# Check for autoconf
+env autoconf --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ AUTOCONF=autoconf
+else
+ MISSING="$MISSING autoconf"
+fi
+
+# Check for autoheader
+env autoheader --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ AUTOHEADER=autoheader
+else
+ MISSING="$MISSING autoheader"
+fi
+
+# Check for automake
+env automake --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ AUTOMAKE=automake
+else
+ MISSING="$MISSING automake"
+fi
+
+# Check for libtoolize or glibtoolize
+env libtoolize --version > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ # libtoolize was found, so use it
+ TOOL=libtoolize
+else
+ # libtoolize wasn't found, so check for glibtoolize
+ env glibtoolize --version > /dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ TOOL=glibtoolize
+ else
+ MISSING="$MISSING libtoolize/glibtoolize"
+ fi
+fi
+
+# Check for tar
+env tar --version > /dev/null 2>&1
+if [ $? -ne 0 ]; then
+ MISSING="$MISSING tar"
+fi
+
+## If dependencies are missing, warn the user and abort
+if [ "x$MISSING" != "x" ]; then
+ echo "Aborting."
+ echo
+ echo "The following build tools are missing:"
+ echo
+ for pkg in $MISSING; do
+ echo " * $pkg"
+ done
+ echo
+ echo "Please install them and try again."
+ echo
+ exit 1
+fi
+
+## generate gf-error-codes.h from error-codes.json
+echo "Generate gf-error-codes.h ..."
+if ./gen-headers.py; then
+ if ! mv -fv gf-error-codes.h libglusterfs/src/gf-error-codes.h; then
+ exit 1
+ fi
+else
+ exit 1
+fi
+
+## Do the autogeneration
+echo Running ${ACLOCAL}...
+$ACLOCAL -I ./contrib/aclocal
+echo Running ${AUTOHEADER}...
+$AUTOHEADER
+echo Running ${TOOL}...
+$TOOL --automake --copy --force
+echo Running ${AUTOCONF}...
+$AUTOCONF
+echo Running ${AUTOMAKE}...
+$AUTOMAKE --add-missing --copy --foreign
+
+# Run autogen in the argp-standalone sub-directory
cd argp-standalone;./autogen.sh
+
+# Instruct user on next steps
+echo
+echo "Please proceed with configuring, compiling, and installing."
diff --git a/booster/Makefile.am b/booster/Makefile.am
deleted file mode 100644
index e1c45f305..000000000
--- a/booster/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS=src \ No newline at end of file
diff --git a/booster/src/Makefile.am b/booster/src/Makefile.am
deleted file mode 100644
index d7d83abf5..000000000
--- a/booster/src/Makefile.am
+++ /dev/null
@@ -1,21 +0,0 @@
-ldpreload_LTLIBRARIES = libglusterfs-booster.la
-ldpreloaddir = $(libdir)/glusterfs
-noinst_HEADERS = booster_fstab.h booster-fd.h
-libglusterfs_booster_la_SOURCES = booster.c booster_stat.c booster_fstab.c booster-fd.c
-libglusterfs_booster_la_CFLAGS = -I$(top_srcdir)/libglusterfsclient/src/ -D_GNU_SOURCE -D$(GF_HOST_OS) -fPIC -Wall \
- -pthread $(GF_BOOSTER_CFLAGS) -shared -nostartfiles
-libglusterfs_booster_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \
- -I$(top_srcdir)/libglusterfsclient/src \
- -I$(top_srcdir)/libglusterfs/src -DDATADIR=\"$(localstatedir)\" \
- -DCONFDIR=\"$(sysconfdir)/glusterfs\" $(ARGP_STANDALONE_CPPFLAGS)
-
-libglusterfs_booster_la_LDFLAGS = -module -avoidversion
-libglusterfs_booster_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(top_builddir)/libglusterfsclient/src/libglusterfsclient.la
-
-CLEANFILES =
-
-uninstall-local:
- rm -f $(DESTDIR)$(ldpreloaddir)/glusterfs-booster.so
-
-install-data-hook:
- ln -sf libglusterfs-booster.so $(DESTDIR)$(ldpreloaddir)/glusterfs-booster.so
diff --git a/booster/src/booster-fd.c b/booster/src/booster-fd.c
deleted file mode 100644
index fa5b0cde2..000000000
--- a/booster/src/booster-fd.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-
-#include "booster-fd.h"
-#include <logging.h>
-#include <mem-pool.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <common-utils.h>
-#include <string.h>
-
-#include <assert.h>
-
-extern fd_t *
-fd_ref (fd_t *fd);
-
-extern void
-fd_unref (fd_t *fd);
-/*
- Allocate in memory chunks of power of 2 starting from 1024B
- Assumes fdtable->lock is held
- */
-static inline uint
-gf_roundup_power_of_two (uint nr)
-{
- uint result = 1;
-
- if (nr < 0) {
- gf_log ("booster-fd", GF_LOG_ERROR, "Negative number passed");
- return -1;
- }
-
- while (result <= nr)
- result *= 2;
-
- return result;
-}
-
-#define BOOSTER_NFDBITS (sizeof (unsigned long))
-
-#define BOOSTER_FDMASK(d) (1UL << ((d) % BOOSTER_NFDBITS))
-#define BOOSTER_FDELT(d) (d / BOOSTER_NFDBITS)
-#define BOOSTER_FD_SET(set, d) (set->fd_bits[BOOSTER_FDELT(d)] |= BOOSTER_FDMASK(d))
-#define BOOSTER_FD_CLR(set, d) (set->fd_bits[BOOSTER_FDELT(d)] &= ~BOOSTER_FDMASK(d))
-#define BOOSTER_FD_ISSET(set, d) (set->fd_bits[BOOSTER_FDELT(d)] & BOOSTER_FDMASK(d))
-
-inline int
-booster_get_close_on_exec (booster_fdtable_t *fdtable, int fd)
-{
- return BOOSTER_FD_ISSET(fdtable->close_on_exec, fd);
-}
-
-inline void
-booster_set_close_on_exec (booster_fdtable_t *fdtable, int fd)
-{
- BOOSTER_FD_SET(fdtable->close_on_exec, fd);
-}
-
-int
-booster_fdtable_expand (booster_fdtable_t *fdtable, uint nr)
-{
- fd_t **oldfds = NULL, **tmp = NULL;
- uint oldmax_fds = -1;
- uint cpy = 0;
- int32_t ret = -1, bytes = 0;
- booster_fd_set_t *oldclose_on_exec = NULL;
-
- if (fdtable == NULL || nr < 0) {
- gf_log ("booster-fd", GF_LOG_ERROR, "Invalid argument");
- errno = EINVAL;
- ret = -1;
- goto out;
- }
-
- nr /= (1024 / sizeof (fd_t *));
- nr = gf_roundup_power_of_two (nr + 1);
- nr *= (1024 / sizeof (fd_t *));
-
- oldfds = fdtable->fds;
- oldmax_fds = fdtable->max_fds;
- oldclose_on_exec = fdtable->close_on_exec;
-
- fdtable->fds = CALLOC (nr, sizeof (fd_t *));
- if (fdtable->fds == NULL) {
- gf_log ("booster-fd", GF_LOG_ERROR, "Memory allocation failed");
- fdtable->fds = oldfds;
- oldfds = NULL;
- ret = -1;
- goto out;
- }
-
- fdtable->max_fds = nr;
-
- if (oldfds) {
- cpy = oldmax_fds * sizeof (fd_t *);
- memcpy (fdtable->fds, oldfds, cpy);
- }
-
- /* nr will be either less than 8 or a multiple of 8 */
- bytes = nr/8;
- bytes = bytes ? bytes : 1;
- fdtable->close_on_exec = CALLOC (bytes, 1);
- if (fdtable->close_on_exec == NULL) {
- gf_log ("booster-fd", GF_LOG_ERROR, "Memory allocation "
- "failed");
- tmp = fdtable->fds;
- fdtable->fds = oldfds;
- oldfds = tmp;
- ret = -1;
- goto out;
- }
-
- if (oldclose_on_exec != NULL) {
- bytes = oldmax_fds/8;
- cpy = bytes ? bytes : 1;
- memcpy (fdtable->close_on_exec, oldclose_on_exec, cpy);
- }
- gf_log ("booster-fd", GF_LOG_TRACE, "FD-table expanded: Old: %d,New: %d"
- , oldmax_fds, nr);
- ret = 0;
-
-out:
- FREE (oldfds);
- FREE (oldclose_on_exec);
-
- return ret;
-}
-
-booster_fdtable_t *
-booster_fdtable_alloc (void)
-{
- booster_fdtable_t *fdtable = NULL;
- int32_t ret = -1;
-
- fdtable = CALLOC (1, sizeof (*fdtable));
- GF_VALIDATE_OR_GOTO ("booster-fd", fdtable, out);
-
- LOCK_INIT (&fdtable->lock);
-
- LOCK (&fdtable->lock);
- {
- ret = booster_fdtable_expand (fdtable, 0);
- }
- UNLOCK (&fdtable->lock);
-
- if (ret == -1) {
- gf_log ("booster-fd", GF_LOG_ERROR, "FD-table allocation "
- "failed");
- FREE (fdtable);
- fdtable = NULL;
- }
-
-out:
- return fdtable;
-}
-
-fd_t **
-__booster_fdtable_get_all_fds (booster_fdtable_t *fdtable, uint *count)
-{
- fd_t **fds = NULL;
-
- if (count == NULL)
- goto out;
-
- fds = fdtable->fds;
- fdtable->fds = calloc (fdtable->max_fds, sizeof (fd_t *));
- *count = fdtable->max_fds;
-
-out:
- return fds;
-}
-
-fd_t **
-booster_fdtable_get_all_fds (booster_fdtable_t *fdtable, uint *count)
-{
- fd_t **fds = NULL;
- if (!fdtable)
- return NULL;
-
- LOCK (&fdtable->lock);
- {
- fds = __booster_fdtable_get_all_fds (fdtable, count);
- }
- UNLOCK (&fdtable->lock);
-
- return fds;
-}
-
-void
-booster_fdtable_destroy (booster_fdtable_t *fdtable)
-{
- fd_t *fd = NULL;
- fd_t **fds = NULL;
- uint fd_count = 0;
- int i = 0;
-
- if (!fdtable)
- return;
-
- LOCK (&fdtable->lock);
- {
- fds = __booster_fdtable_get_all_fds (fdtable, &fd_count);
- FREE (fdtable->fds);
- }
- UNLOCK (&fdtable->lock);
-
- if (!fds)
- goto free_table;
-
- for (i = 0; i < fd_count; i++) {
- fd = fds[i];
- if (fd != NULL)
- fd_unref (fd);
- }
- FREE (fds);
-free_table:
- LOCK_DESTROY (&fdtable->lock);
- FREE (fdtable);
-}
-
-int
-booster_fd_unused_get (booster_fdtable_t *fdtable, fd_t *fdptr, int fd)
-{
- int ret = -1;
- int error = 0;
-
- if (fdtable == NULL || fdptr == NULL || fd < 0) {
- gf_log ("booster-fd", GF_LOG_ERROR, "invalid argument");
- errno = EINVAL;
- return -1;
- }
-
- gf_log ("booster-fd", GF_LOG_TRACE, "Requested fd: %d", fd);
- LOCK (&fdtable->lock);
- {
- while (fdtable->max_fds < fd) {
- error = 0;
- error = booster_fdtable_expand (fdtable,
- fdtable->max_fds + 1);
- if (error) {
- gf_log ("booster-fd", GF_LOG_ERROR,
- "Cannot expand fdtable:%s",
- strerror (error));
- goto err;
- }
- }
-
- if (!fdtable->fds[fd]) {
- fdtable->fds[fd] = fdptr;
- fd_ref (fdptr);
- ret = fd;
- } else
- gf_log ("booster-fd", GF_LOG_ERROR, "Cannot allocate fd"
- " %d (slot not empty in fdtable)", fd);
- }
-err:
- UNLOCK (&fdtable->lock);
-
- return ret;
-}
-
-void
-booster_fd_put (booster_fdtable_t *fdtable, int fd)
-{
- fd_t *fdptr = NULL;
- if (fdtable == NULL || fd < 0) {
- gf_log ("booster-fd", GF_LOG_ERROR, "invalid argument");
- return;
- }
-
- gf_log ("booster-fd", GF_LOG_TRACE, "FD put: %d", fd);
- if (!(fd < fdtable->max_fds)) {
- gf_log ("booster-fd", GF_LOG_ERROR, "FD not in booster fd"
- " table");
- return;
- }
-
- LOCK (&fdtable->lock);
- {
- fdptr = fdtable->fds[fd];
- fdtable->fds[fd] = NULL;
- }
- UNLOCK (&fdtable->lock);
-
- if (fdptr)
- fd_unref (fdptr);
-}
-
-fd_t *
-booster_fdptr_get (booster_fdtable_t *fdtable, int fd)
-{
- fd_t *fdptr = NULL;
-
- if (fdtable == NULL || fd < 0) {
- gf_log ("booster-fd", GF_LOG_ERROR, "invalid argument");
- errno = EINVAL;
- return NULL;
- }
-
- gf_log ("booster-fd", GF_LOG_TRACE, "FD ptr request: %d", fd);
- if (!(fd < fdtable->max_fds)) {
- gf_log ("booster-fd", GF_LOG_ERROR, "FD not in booster fd"
- " table");
- errno = EINVAL;
- return NULL;
- }
-
- LOCK (&fdtable->lock);
- {
- fdptr = fdtable->fds[fd];
- if (fdptr)
- fd_ref (fdptr);
- }
- UNLOCK (&fdtable->lock);
-
- return fdptr;
-}
-
-void
-booster_fdptr_put (fd_t *booster_fd)
-{
- if (booster_fd)
- fd_unref (booster_fd);
-}
diff --git a/booster/src/booster-fd.h b/booster/src/booster-fd.h
deleted file mode 100644
index 595a112bd..000000000
--- a/booster/src/booster-fd.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _BOOSTER_FD_H
-#define _BOOSTER_FD_H
-
-#include <libglusterfsclient.h>
-#include <locking.h>
-#include <list.h>
-
-/* This struct must be updated if the fd_t in fd.h changes.
- * We cannot include those headers here because unistd.h, included
- * by glusterfs headers, conflicts with the syscall prototypes we
- * define for booster.
- */
-struct _fd {
- pid_t pid;
- int32_t flags;
- int32_t refcount;
- struct list_head inode_list;
- struct _inode *inode;
- struct _dict *ctx;
- gf_lock_t lock; /* used ONLY for manipulating
- 'struct _fd_ctx' array (_ctx).*/
- struct _fd_ctx *_ctx;
-};
-typedef struct _fd fd_t;
-
-struct _booster_fd_set {
- unsigned long fd_bits[0];
-};
-typedef struct _booster_fd_set booster_fd_set_t;
-
-struct _booster_fdtable {
- booster_fd_set_t *close_on_exec;
- int refcount;
- unsigned int max_fds;
- gf_lock_t lock;
- fd_t **fds;
-};
-typedef struct _booster_fdtable booster_fdtable_t;
-
-void
-booster_set_close_on_exec (booster_fdtable_t *fdtable, int fd);
-
-int
-booster_get_close_on_exec (booster_fdtable_t *fdtable, int fd);
-
-extern int
-booster_fd_unused_get (booster_fdtable_t *fdtable, fd_t *fdptr, int fd);
-
-extern void
-booster_fd_put (booster_fdtable_t *fdtable, int fd);
-
-extern fd_t *
-booster_fdptr_get (booster_fdtable_t *fdtable, int fd);
-
-extern void
-booster_fdptr_put (fd_t *fdptr);
-
-extern void
-booster_fdtable_destroy (booster_fdtable_t *fdtable);
-
-booster_fdtable_t *
-booster_fdtable_alloc (void);
-
-#endif /* #ifndef _BOOSTER_FD_H */
diff --git a/booster/src/booster.c b/booster/src/booster.c
deleted file mode 100644
index c34ec1146..000000000
--- a/booster/src/booster.c
+++ /dev/null
@@ -1,3172 +0,0 @@
-/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <dlfcn.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <libglusterfsclient.h>
-#include <list.h>
-#include <pthread.h>
-#include <sys/xattr.h>
-#include <string.h>
-#include <assert.h>
-#include <errno.h>
-#include <ctype.h>
-#include <logging.h>
-#include <utime.h>
-#include <dirent.h>
-#include <sys/statfs.h>
-#include <sys/statvfs.h>
-#include <fcntl.h>
-#include "booster-fd.h"
-
-#ifndef GF_UNIT_KB
-#define GF_UNIT_KB 1024
-#endif
-
-static pthread_mutex_t cwdlock = PTHREAD_MUTEX_INITIALIZER;
-
-/* The constructor attribute registers this function with libc's
- * _init machinery, so that it is run before the program's main().
- */
-static void booster_lib_init (void) __attribute__((constructor));
-
-extern fd_t *
-fd_ref (fd_t *fd);
-
-extern void
-fd_unref (fd_t *fd);
-
-extern int pipe (int filedes[2]);
-/* We define these flags so that we can remove fcntl.h from the include path.
- * fcntl.h has certain defines and other lines of code that redirect the
- * application's open and open64 calls to the syscalls defined by
- * libc; for us, that's not a Good Thing (TM).
- */
-#ifndef GF_O_CREAT
-#define GF_O_CREAT 0x40
-#endif
-
-#ifndef GF_O_TRUNC
-#define GF_O_TRUNC 0x200
-#endif
-
-#ifndef GF_O_RDWR
-#define GF_O_RDWR 0x2
-#endif
-
-#ifndef GF_O_WRONLY
-#define GF_O_WRONLY 0x1
-#endif
-
-#ifndef UNIX_PATH_MAX
-#define UNIX_PATH_MAX 108
-#endif
-
-typedef enum {
- BOOSTER_OPEN,
- BOOSTER_CREAT
-} booster_op_t;
-
-struct _inode;
-struct _dict;
-
-ssize_t
-write (int fd, const void *buf, size_t count);
-
-/* open, open64, creat */
-static int (*real_open) (const char *pathname, int flags, ...);
-static int (*real_open64) (const char *pathname, int flags, ...);
-static int (*real_creat) (const char *pathname, mode_t mode);
-static int (*real_creat64) (const char *pathname, mode_t mode);
-
-/* read, readv, pread, pread64 */
-static ssize_t (*real_read) (int fd, void *buf, size_t count);
-static ssize_t (*real_readv) (int fd, const struct iovec *vector, int count);
-static ssize_t (*real_pread) (int fd, void *buf, size_t count,
- unsigned long offset);
-static ssize_t (*real_pread64) (int fd, void *buf, size_t count,
- uint64_t offset);
-
-/* write, writev, pwrite, pwrite64 */
-static ssize_t (*real_write) (int fd, const void *buf, size_t count);
-static ssize_t (*real_writev) (int fd, const struct iovec *vector, int count);
-static ssize_t (*real_pwrite) (int fd, const void *buf, size_t count,
- unsigned long offset);
-static ssize_t (*real_pwrite64) (int fd, const void *buf, size_t count,
- uint64_t offset);
-
-/* lseek, llseek, lseek64 */
-static off_t (*real_lseek) (int fildes, unsigned long offset, int whence);
-static off_t (*real_lseek64) (int fildes, uint64_t offset, int whence);
-
-/* close */
-static int (*real_close) (int fd);
-
-/* dup dup2 */
-static int (*real_dup) (int fd);
-static int (*real_dup2) (int oldfd, int newfd);
-
-static pid_t (*real_fork) (void);
-static int (*real_mkdir) (const char *pathname, mode_t mode);
-static int (*real_rmdir) (const char *pathname);
-static int (*real_chmod) (const char *pathname, mode_t mode);
-static int (*real_chown) (const char *pathname, uid_t owner, gid_t group);
-static int (*real_fchmod) (int fd, mode_t mode);
-static int (*real_fchown) (int fd, uid_t owner, gid_t gid);
-static int (*real_fsync) (int fd);
-static int (*real_ftruncate) (int fd, off_t length);
-static int (*real_ftruncate64) (int fd, loff_t length);
-static int (*real_link) (const char *oldpath, const char *newname);
-static int (*real_rename) (const char *oldpath, const char *newpath);
-static int (*real_utimes) (const char *path, const struct timeval times[2]);
-static int (*real_utime) (const char *path, const struct utimbuf *buf);
-static int (*real_mknod) (const char *path, mode_t mode, dev_t dev);
-static int (*real_mkfifo) (const char *path, mode_t mode);
-static int (*real_unlink) (const char *path);
-static int (*real_symlink) (const char *oldpath, const char *newpath);
-static int (*real_readlink) (const char *path, char *buf, size_t bufsize);
-static char * (*real_realpath) (const char *path, char *resolved);
-static DIR * (*real_opendir) (const char *path);
-static struct dirent * (*real_readdir) (DIR *dir);
-static struct dirent64 * (*real_readdir64) (DIR *dir);
-static int (*real_readdir_r) (DIR *dir, struct dirent *entry,
- struct dirent **result);
-static int (*real_readdir64_r) (DIR *dir, struct dirent64 *entry,
- struct dirent64 **result);
-static int (*real_closedir) (DIR *dh);
-static int (*real___xstat) (int ver, const char *path, struct stat *buf);
-static int (*real___xstat64) (int ver, const char *path, struct stat64 *buf);
-static int (*real_stat) (const char *path, struct stat *buf);
-static int (*real_stat64) (const char *path, struct stat64 *buf);
-static int (*real___fxstat) (int ver, int fd, struct stat *buf);
-static int (*real___fxstat64) (int ver, int fd, struct stat64 *buf);
-static int (*real_fstat) (int fd, struct stat *buf);
-static int (*real_fstat64) (int fd, struct stat64 *buf);
-static int (*real___lxstat) (int ver, const char *path, struct stat *buf);
-static int (*real___lxstat64) (int ver, const char *path, struct stat64 *buf);
-static int (*real_lstat) (const char *path, struct stat *buf);
-static int (*real_lstat64) (const char *path, struct stat64 *buf);
-static int (*real_statfs) (const char *path, struct statfs *buf);
-static int (*real_statfs64) (const char *path, struct statfs64 *buf);
-static int (*real_statvfs) (const char *path, struct statvfs *buf);
-static int (*real_statvfs64) (const char *path, struct statvfs64 *buf);
-static ssize_t (*real_getxattr) (const char *path, const char *name,
- void *value, size_t size);
-static ssize_t (*real_lgetxattr) (const char *path, const char *name,
- void *value, size_t size);
-static int (*real_remove) (const char* path);
-static int (*real_lchown) (const char *path, uid_t owner, gid_t group);
-static void (*real_rewinddir) (DIR *dirp);
-static void (*real_seekdir) (DIR *dirp, off_t offset);
-static off_t (*real_telldir) (DIR *dirp);
-
-static ssize_t (*real_sendfile) (int out_fd, int in_fd, off_t *offset,
- size_t count);
-static ssize_t (*real_sendfile64) (int out_fd, int in_fd, off_t *offset,
- size_t count);
-static int (*real_fcntl) (int fd, int cmd, ...);
-static int (*real_chdir) (const char *path);
-static int (*real_fchdir) (int fd);
-static char * (*real_getcwd) (char *buf, size_t size);
-static int (*real_truncate) (const char *path, off_t length);
-static int (*real_truncate64) (const char *path, loff_t length);
-static int (*real_setxattr) (const char *path, const char *name,
- const void *value, size_t size, int flags);
-static int (*real_lsetxattr) (const char *path, const char *name,
- const void *value, size_t size, int flags);
-static int (*real_fsetxattr) (int filedes, const char *name,
- const void *value, size_t size, int flags);
-
-
-#define RESOLVE(sym) do { \
- if (!real_##sym) \
- real_##sym = dlsym (RTLD_NEXT, #sym); \
- } while (0)
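The RESOLVE macro above is the standard dlsym interposition idiom: ask the dynamic linker for the next definition of the symbol in link order (normally libc's) and cache it, so the wrapper can forward calls it does not want to handle itself. A stand-alone sketch of that idiom follows; it is not booster code and interposes unlink purely as an example.

    #define _GNU_SOURCE             /* for RTLD_NEXT */
    #include <dlfcn.h>
    #include <errno.h>

    static int (*next_unlink) (const char *path);

    /* Illustrative interposer: resolve libc's unlink lazily, then forward. */
    int
    unlink (const char *path)
    {
            if (!next_unlink)
                    next_unlink = dlsym (RTLD_NEXT, "unlink");

            /* ... a real interposer would consult its own state here ... */

            if (!next_unlink) {
                    errno = ENOSYS; /* symbol not found; nothing to forward to */
                    return -1;
            }

            return next_unlink (path);
    }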
-
-/*TODO: set proper value */
-#define MOUNT_HASH_SIZE 256
-
-struct booster_mount {
- dev_t st_dev;
- glusterfs_handle_t handle;
- struct list_head device_list;
-};
-typedef struct booster_mount booster_mount_t;
-
-static booster_fdtable_t *booster_fdtable = NULL;
-
-extern int booster_configure (char *confpath);
-/* This is dup'ed every time VMP open/creat wants a new fd.
- * This is needed so we occupy an entry in the process' file
- * table.
- */
-int process_piped_fd = -1;
-
-static int
-booster_get_process_fd ()
-{
- return real_dup (process_piped_fd);
-}
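The comment above process_piped_fd is the key to booster_get_process_fd: virtual-mount-point opens never create a kernel file-table entry of their own, so booster reserves a genuine fd number by dup'ing the read end of a throw-away pipe and then maps that number to the glusterfs handle. A stand-alone sketch of the reservation trick (reserve_fd_number and reserve_source_fd are illustrative names, not booster's):

    #include <unistd.h>

    static int reserve_source_fd = -1;

    /* Return a fresh fd number that the kernel considers in use, without
     * opening any real file: dup() the read end of a pipe created once. */
    static int
    reserve_fd_number (void)
    {
            int pipefd[2];

            if (reserve_source_fd == -1) {
                    if (pipe (pipefd) == -1)
                            return -1;
                    reserve_source_fd = pipefd[0];
                    close (pipefd[1]);      /* the write end is never used */
            }

            return dup (reserve_source_fd);
    }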
-
-/* The following two define which file contains
- * the FSTAB configuration for VMP-based usage.
- */
-#define DEFAULT_BOOSTER_CONF CONFDIR"/booster.conf"
-#define BOOSTER_CONF_ENV_VAR "GLUSTERFS_BOOSTER_FSTAB"
-
-
-/* The following define which log file is used when
- * using the old mount point bypass approach.
- */
-#define BOOSTER_DEFAULT_LOG CONFDIR"/booster.log"
-#define BOOSTER_LOG_ENV_VAR "GLUSTERFS_BOOSTER_LOG"
-
-void
-do_open (int fd, const char *pathname, int flags, mode_t mode, booster_op_t op)
-{
- char *specfile = NULL;
- char *mount_point = NULL;
- int32_t size = 0;
- int32_t ret = -1;
- FILE *specfp = NULL;
- glusterfs_file_t fh = NULL;
- char *logfile = NULL;
- glusterfs_init_params_t iparams = {
- .loglevel = "error",
- .lookup_timeout = 600,
- .stat_timeout = 600,
- };
-
- gf_log ("booster", GF_LOG_DEBUG, "Opening using MPB: %s", pathname);
- size = fgetxattr (fd, "user.glusterfs-booster-volfile", NULL, 0);
- if (size == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Xattr "
- "user.glusterfs-booster-volfile not found: %s",
- strerror (errno));
- goto out;
- }
-
- specfile = calloc (1, size);
- if (!specfile) {
- gf_log ("booster", GF_LOG_ERROR, "Memory allocation failed");
- goto out;
- }
-
- ret = fgetxattr (fd, "user.glusterfs-booster-volfile", specfile,
- size);
- if (ret == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Xattr "
- "user.glusterfs-booster-volfile not found: %s",
- strerror (errno));
- goto out;
- }
-
- specfp = tmpfile ();
- if (!specfp) {
- gf_log ("booster", GF_LOG_ERROR, "Temp file creation failed"
- ": %s", strerror (errno));
- goto out;
- }
-
- ret = fwrite (specfile, size, 1, specfp);
- if (ret != 1) {
- gf_log ("booster", GF_LOG_ERROR, "Failed to write volfile: %s",
- strerror (errno));
- goto out;
- }
-
- fseek (specfp, 0L, SEEK_SET);
-
- size = fgetxattr (fd, "user.glusterfs-booster-mount", NULL, 0);
- if (size == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Xattr "
- "user.glusterfs-booster-mount not found: %s",
- strerror (errno));
- goto out;
- }
-
- mount_point = calloc (size, sizeof (char));
- if (!mount_point) {
- gf_log ("booster", GF_LOG_ERROR, "Memory allocation failed");
- goto out;
- }
-
- ret = fgetxattr (fd, "user.glusterfs-booster-mount", mount_point, size);
- if (ret == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Xattr "
- "user.glusterfs-booster-mount not found: %s",
- strerror (errno));
- goto out;
- }
-
- logfile = getenv (BOOSTER_LOG_ENV_VAR);
- if (logfile) {
- if (strlen (logfile) > 0)
- iparams.logfile = strdup (logfile);
- else
- iparams.logfile = strdup (BOOSTER_DEFAULT_LOG);
- } else {
- iparams.logfile = strdup (BOOSTER_DEFAULT_LOG);
- }
-
- gf_log ("booster", GF_LOG_TRACE, "Using log-file: %s", iparams.logfile);
- iparams.specfp = specfp;
-
- ret = glusterfs_mount (mount_point, &iparams);
- if (ret == -1) {
- if (errno != EEXIST) {
- gf_log ("booster", GF_LOG_ERROR, "Mount failed over"
- " glusterfs");
- goto out;
- } else
- gf_log ("booster", GF_LOG_ERROR, "Already mounted");
- }
-
- switch (op) {
- case BOOSTER_OPEN:
- gf_log ("booster", GF_LOG_TRACE, "Booster open call");
- fh = glusterfs_open (pathname, flags, mode);
- break;
-
- case BOOSTER_CREAT:
- gf_log ("booster", GF_LOG_TRACE, "Booster create call");
- fh = glusterfs_creat (pathname, mode);
- break;
- }
-
- if (!fh) {
- gf_log ("booster", GF_LOG_ERROR, "Error performing operation");
- goto out;
- }
-
- if (booster_fd_unused_get (booster_fdtable, fh, fd) == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Failed to get unused FD");
- goto out;
- }
- fh = NULL;
-
-out:
- if (specfile) {
- free (specfile);
- }
-
- if (specfp) {
- fclose (specfp);
- }
-
- if (mount_point) {
- free (mount_point);
- }
-
- if (fh) {
- glusterfs_close (fh);
- }
-
- return;
-}
-
-int
-vmp_open (const char *pathname, int flags, ...)
-{
- mode_t mode = 0;
- int fd = -1;
- glusterfs_file_t fh = NULL;
- va_list ap;
-
- if (flags & GF_O_CREAT) {
- va_start (ap, flags);
- mode = va_arg (ap, mode_t);
- va_end (ap);
-
- fh = glusterfs_open (pathname, flags, mode);
- }
- else
- fh = glusterfs_open (pathname, flags);
-
- if (!fh) {
- gf_log ("booster", GF_LOG_ERROR, "VMP open failed");
- goto out;
- }
-
- fd = booster_get_process_fd ();
- if (fd == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Failed to create open fd");
- goto fh_close_out;
- }
-
- if (booster_fd_unused_get (booster_fdtable, fh, fd) == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Failed to map fd into table");
- goto realfd_close_out;
- }
-
- return fd;
-
-realfd_close_out:
- real_close (fd);
- fd = -1;
-
-fh_close_out:
- glusterfs_close (fh);
-
-out:
- return fd;
-}
-
-#define BOOSTER_USE_OPEN64 1
-#define BOOSTER_DONT_USE_OPEN64 0
-
-int
-booster_open (const char *pathname, int use64, int flags, ...)
-{
- int ret = -1;
- mode_t mode = 0;
- va_list ap;
- int (*my_open) (const char *pathname, int flags, ...);
-
- if (!pathname) {
- errno = EINVAL;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "Open: %s", pathname);
- /* First try opening through the virtual mount point.
- * The difference lies in the fact that:
- * 1. We depend on libglusterfsclient library to perform
- * the translation from the path to handle.
- * 2. We do not go to the file system for the fd, instead
- * we use booster_get_process_fd (), which returns a dup'ed
- * fd of a pipe created in booster_init.
- */
- if (flags & GF_O_CREAT) {
- va_start (ap, flags);
- mode = va_arg (ap, mode_t);
- va_end (ap);
- ret = vmp_open (pathname, flags, mode);
- }
- else
- ret = vmp_open (pathname, flags);
-
- /* We receive an ENODEV if the VMP does not exist. If we
- * receive an error other than ENODEV, it means, there
- * actually was an error performing vmp_open. This must
- * be returned to the user.
- */
- if ((ret < 0) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "Error in opening file over "
- " VMP: %s", strerror (errno));
- goto out;
- }
-
- if (ret > 0) {
- gf_log ("booster", GF_LOG_TRACE, "File opened");
- goto out;
- }
-
- if (use64) {
- gf_log ("booster", GF_LOG_TRACE, "Using 64-bit open");
- my_open = real_open64;
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Using 32-bit open");
- my_open = real_open;
- }
-
- /* It is possible that the RESOLVE macro was not able
- * to resolve the symbol of a function; in that case we
- * don't want to segfault by calling a NULL function pointer.
- */
- if (my_open == NULL) {
- gf_log ("booster", GF_LOG_ERROR, "open not resolved");
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- if (flags & GF_O_CREAT) {
- va_start (ap, flags);
- mode = va_arg (ap, mode_t);
- va_end (ap);
-
- ret = my_open (pathname, flags, mode);
- } else
- ret = my_open (pathname, flags);
-
- if (ret != -1) {
- do_open (ret, pathname, flags, mode, BOOSTER_OPEN);
- }
-
-out:
- return ret;
-}
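The ENODEV convention described in the comments above is repeated by every path-based wrapper further down (mkdir, rmdir, chmod, and so on). A condensed, hypothetical rendering of that shape; try_gluster_then_libc and its two callbacks are illustrative names, not booster symbols.

    #include <errno.h>
    #include <stddef.h>

    /* ENODEV from libglusterfsclient means "this path is not under a
     * virtual mount point"; only that error falls through to libc. */
    static int
    try_gluster_then_libc (int (*gluster_call) (const char *),
                           int (*libc_call) (const char *),
                           const char *path)
    {
            int ret = gluster_call (path);

            if (ret == 0)
                    return ret;             /* handled by glusterfs */

            if (errno != ENODEV)
                    return ret;             /* real error on a VMP path */

            if (libc_call == NULL) {        /* RESOLVE never found the symbol */
                    errno = ENOSYS;
                    return -1;
            }

            return libc_call (path);
    }

rmdir () below, for instance, is exactly this shape with glusterfs_rmdir and real_rmdir plugged in.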
-
-/* This is done to overwrite the existing definitions of open and open64
- * inside libc with our own copies. __REDIRECT is provided by glibc.
- *
- * XXX: This will not work anywhere other than glibc-based systems.
- */
-int __REDIRECT (booster_false_open, (__const char *__file, int __oflag, ...),
- open) __nonnull ((1));
-int __REDIRECT (booster_false_open64, (__const char *__file, int __oflag, ...),
- open64) __nonnull ((1));
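For readers unfamiliar with __REDIRECT: it is a glibc macro (from <sys/cdefs.h>) that declares a function under one C-level name while pinning its assembler symbol to another, so defining the private name emits the public symbol. A minimal stand-alone sketch of the same trick, not booster code; my_false_rmdir is a hypothetical name.

    #include <sys/cdefs.h>

    /* my_false_rmdir is the C-level name; the emitted symbol is "rmdir",
     * so this definition preempts libc's for the whole program. */
    int __REDIRECT (my_false_rmdir, (const char *__path), rmdir);

    int
    my_false_rmdir (const char *path)
    {
            /* intercept here; a real wrapper would forward non-boosted
             * paths via a dlsym (RTLD_NEXT, "rmdir") pointer. */
            (void) path;
            return 0;
    }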
-int
-booster_false_open (const char *pathname, int flags, ...)
-{
- int ret;
- mode_t mode = 0;
- va_list ap;
-
- if (flags & GF_O_CREAT) {
- va_start (ap, flags);
- mode = va_arg (ap, mode_t);
- va_end (ap);
-
- ret = booster_open (pathname, BOOSTER_DONT_USE_OPEN64, flags,
- mode);
- }
- else
- ret = booster_open (pathname, BOOSTER_DONT_USE_OPEN64, flags);
-
- return ret;
-}
-
-int
-booster_false_open64 (const char *pathname, int flags, ...)
-{
- int ret;
- mode_t mode = 0;
- va_list ap;
-
- if (flags & GF_O_CREAT) {
- va_start (ap, flags);
- mode = va_arg (ap, mode_t);
- va_end (ap);
-
- ret = booster_open (pathname, BOOSTER_USE_OPEN64, flags, mode);
- }
- else
- ret = booster_open (pathname, BOOSTER_USE_OPEN64, flags);
-
- return ret;
-}
-
-int
-vmp_creat (const char *pathname, mode_t mode)
-{
- int fd = -1;
- glusterfs_file_t fh = NULL;
-
- fh = glusterfs_creat (pathname, mode);
- if (!fh) {
- gf_log ("booster", GF_LOG_ERROR, "Create failed: %s: %s",
- pathname, strerror (errno));
- goto out;
- }
-
- fd = booster_get_process_fd ();
- if (fd == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Failed to create fd");
- goto close_out;
- }
-
- if ((booster_fd_unused_get (booster_fdtable, fh, fd)) == -1) {
- gf_log ("booster", GF_LOG_ERROR, "Failed to map unused fd");
- goto real_close_out;
- }
-
- return fd;
-
-real_close_out:
- real_close (fd);
- fd = -1;
-
-close_out:
- glusterfs_close (fh);
-
-out:
- return -1;
-}
-
-int __REDIRECT (booster_false_creat, (const char *pathname, mode_t mode),
- creat) __nonnull ((1));
-int __REDIRECT (booster_false_creat64, (const char *pathname, mode_t mode),
- creat64) __nonnull ((1));
-
-int
-booster_false_creat (const char *pathname, mode_t mode)
-{
- int ret = -1;
- if (!pathname) {
- errno = EINVAL;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "Create: %s", pathname);
- ret = vmp_creat (pathname, mode);
-
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "VMP create failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret > 0) {
- gf_log ("booster", GF_LOG_TRACE, "File created");
- goto out;
- }
-
- if (real_creat == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
- ret = real_creat (pathname, mode);
-
- if (ret != -1) {
- do_open (ret, pathname, GF_O_WRONLY | GF_O_TRUNC, mode,
- BOOSTER_CREAT);
- } else
- gf_log ("booster", GF_LOG_ERROR, "real create failed: %s",
- strerror (errno));
-
-out:
- return ret;
-}
-
-
-int
-booster_false_creat64 (const char *pathname, mode_t mode)
-{
- int ret = -1;
- if (!pathname) {
- errno = EINVAL;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "Create: %s", pathname);
- ret = vmp_creat (pathname, mode);
-
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "VMP create failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret > 0) {
- gf_log ("booster", GF_LOG_TRACE, "File created");
- goto out;
- }
-
- if (real_creat64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
- ret = real_creat64 (pathname, mode);
-
- if (ret != -1) {
- do_open (ret, pathname, GF_O_WRONLY | GF_O_TRUNC, mode,
- BOOSTER_CREAT);
- } else
- gf_log ("booster", GF_LOG_ERROR, "real create failed: %s",
- strerror (errno));
-
-out:
- return ret;
-}
-
-
-/* pread */
-
-ssize_t
-pread (int fd, void *buf, size_t count, unsigned long offset)
-{
- ssize_t ret;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "pread: fd %d, count %lu, offset %lu"
- ,fd, (long unsigned)count, offset);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not booster fd");
- if (real_pread == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_pread (fd, buf, count, offset);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_pread (glfs_fd, buf, count, offset);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-ssize_t
-pread64 (int fd, void *buf, size_t count, uint64_t offset)
-{
- ssize_t ret;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "pread64: fd %d, count %lu, offset %"
- PRIu64, fd, (long unsigned)count, offset);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not booster fd");
- if (real_pread64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_pread64 (fd, buf, count, offset);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_pread (glfs_fd, buf, count, offset);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-ssize_t
-read (int fd, void *buf, size_t count)
-{
- int ret;
- glusterfs_file_t glfs_fd;
-
- gf_log ("booster", GF_LOG_TRACE, "read: fd %d, count %lu", fd,
- (long unsigned)count);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not booster fd");
- if (real_read == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_read (fd, buf, count);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_read (glfs_fd, buf, count);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-ssize_t
-readv (int fd, const struct iovec *vector, int count)
-{
- int ret;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "readv: fd %d, iovecs %d", fd, count);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_readv == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_readv (fd, vector, count);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_readv (glfs_fd, vector, count);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-ssize_t
-write (int fd, const void *buf, size_t count)
-{
- int ret;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "write: fd %d, count %"GF_PRI_SIZET,
- fd, count);
-
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
-
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_write == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_write (fd, buf, count);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_write (glfs_fd, buf, count);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-ssize_t
-writev (int fd, const struct iovec *vector, int count)
-{
- int ret = 0;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "writev: fd %d, iovecs %d", fd, count);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
-
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_writev == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_writev (fd, vector, count);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_writev (glfs_fd, vector, count);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-ssize_t
-pwrite (int fd, const void *buf, size_t count, unsigned long offset)
-{
- int ret;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "pwrite: fd %d, count %"GF_PRI_SIZET
- ", offset %lu", fd, count, offset);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
-
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_pwrite == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_pwrite (fd, buf, count, offset);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_pwrite (glfs_fd, buf, count, offset);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-ssize_t
-pwrite64 (int fd, const void *buf, size_t count, uint64_t offset)
-{
- int ret;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "pwrite64: fd %d, count %"GF_PRI_SIZET
- ", offset %"PRIu64, fd, count, offset);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
-
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_pwrite64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_pwrite64 (fd, buf, count, offset);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_pwrite (glfs_fd, buf, count, offset);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-int
-close (int fd)
-{
- int ret = -1;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "close: fd %d", fd);
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
-
- if (glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- booster_fd_put (booster_fdtable, fd);
- ret = glusterfs_close (glfs_fd);
- booster_fdptr_put (glfs_fd);
- }
-
- ret = real_close (fd);
-
- return ret;
-}
-
-#ifndef _LSEEK_DECLARED
-#define _LSEEK_DECLARED
-off_t
-lseek (int filedes, unsigned long offset, int whence)
-{
- int ret;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "lseek: fd %d, offset %ld",
- filedes, offset);
-
- glfs_fd = booster_fdptr_get (booster_fdtable, filedes);
- if (glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_lseek (glfs_fd, offset, whence);
- booster_fdptr_put (glfs_fd);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_lseek == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_lseek (filedes, offset, whence);
- }
-
- return ret;
-}
-#endif
-
-off_t
-lseek64 (int filedes, uint64_t offset, int whence)
-{
- int ret;
- glusterfs_file_t glfs_fd = 0;
-
-
- gf_log ("booster", GF_LOG_TRACE, "lseek: fd %d, offset %"PRIu64,
- filedes, offset);
- glfs_fd = booster_fdptr_get (booster_fdtable, filedes);
- if (glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_lseek (glfs_fd, offset, whence);
- booster_fdptr_put (glfs_fd);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_lseek64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_lseek64 (filedes, offset, whence);
- }
-
- return ret;
-}
-
-int
-dup (int oldfd)
-{
- int ret = -1, new_fd = -1;
- glusterfs_file_t glfs_fd = 0;
-
- gf_log ("booster", GF_LOG_TRACE, "dup: fd %d", oldfd);
- glfs_fd = booster_fdptr_get (booster_fdtable, oldfd);
- new_fd = real_dup (oldfd);
-
- if (new_fd >=0 && glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = booster_fd_unused_get (booster_fdtable, glfs_fd,
- new_fd);
- fd_ref ((fd_t *)glfs_fd);
- if (ret == -1) {
- gf_log ("booster", GF_LOG_ERROR,"Failed to map new fd");
- real_close (new_fd);
- }
- }
-
- if (glfs_fd) {
- booster_fdptr_put (glfs_fd);
- }
-
- return new_fd;
-}
-
-
-int
-dup2 (int oldfd, int newfd)
-{
- int ret = -1;
- glusterfs_file_t old_glfs_fd = NULL, new_glfs_fd = NULL;
-
- if (oldfd == newfd) {
- return newfd;
- }
-
- old_glfs_fd = booster_fdptr_get (booster_fdtable, oldfd);
- new_glfs_fd = booster_fdptr_get (booster_fdtable, newfd);
-
- ret = real_dup2 (oldfd, newfd);
- if (ret >= 0) {
- if (new_glfs_fd) {
- glusterfs_close (new_glfs_fd);
- booster_fdptr_put (new_glfs_fd);
- booster_fd_put (booster_fdtable, newfd);
- new_glfs_fd = 0;
- }
-
- if (old_glfs_fd) {
- ret = booster_fd_unused_get (booster_fdtable,
- old_glfs_fd, newfd);
- fd_ref ((fd_t *)old_glfs_fd);
- if (ret == -1) {
- real_close (newfd);
- }
- }
- }
-
- if (old_glfs_fd) {
- booster_fdptr_put (old_glfs_fd);
- }
-
- if (new_glfs_fd) {
- booster_fdptr_put (new_glfs_fd);
- }
-
- return ret;
-}
-
-int
-mkdir (const char *pathname, mode_t mode)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "mkdir: path %s", pathname);
- ret = glusterfs_mkdir (pathname, mode);
-
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "mkdir failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "directory created");
- return ret;
- }
-
- if (real_mkdir == NULL) {
- ret = -1;
- errno = ENOSYS;
- } else
- ret = real_mkdir (pathname, mode);
-
- return ret;
-}
-
-int
-rmdir (const char *pathname)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "rmdir: path %s", pathname);
- ret = glusterfs_rmdir (pathname);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "rmdir failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "directory removed");
- return ret;
- }
-
- if (real_rmdir == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_rmdir (pathname);
-
- return ret;
-}
-
-int
-chmod (const char *pathname, mode_t mode)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "chmod: path %s", pathname);
- ret = glusterfs_chmod (pathname, mode);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "chmod failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "chmod succeeded");
- return ret;
- }
-
- if (real_chmod == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_chmod (pathname, mode);
-
- return ret;
-}
-
-int
-chown (const char *pathname, uid_t owner, gid_t group)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "chown: path: %s", pathname);
- ret = glusterfs_chown (pathname, owner, group);
-
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "chown failed: %s\n",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "chown succeeded");
- return ret;
- }
-
- if (real_chown == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_chown (pathname, owner, group);
-
- return ret;
-}
-
-int
-fchown (int fd, uid_t owner, gid_t group)
-{
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fchown: fd %d, uid %d, gid %d", fd,
- owner, group);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_fchown == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_fchown (fd, owner, group);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fchown (fh, owner, group);
- booster_fdptr_put (fh);
- }
-
- return ret;
-}
-
-
-#define MOUNT_TABLE_HASH_SIZE 256
-
-
-static void booster_cleanup (void);
-static int
-booster_init (void)
-{
- char *booster_conf_path = NULL;
- int ret = -1;
- int pipefd[2];
-
- booster_fdtable = booster_fdtable_alloc ();
- if (!booster_fdtable) {
- fprintf (stderr, "cannot allocate fdtable: %s\n",
- strerror (errno));
- goto err;
- }
-
- if (pipe (pipefd) == -1) {
- gf_log ("booster-fstab", GF_LOG_ERROR, "Pipe creation failed:%s"
- , strerror (errno));
- goto err;
- }
-
- process_piped_fd = pipefd[0];
- real_close (pipefd[1]);
- /* libglusterfsclient-based VMPs should be initialized only
- * after the fd tables are, so that any fd-based syscalls made
- * while the sockets are being set up find a correctly
- * initialized table, get back a NULL handle, and fall back
- * to the real API.
- */
- booster_conf_path = getenv (BOOSTER_CONF_ENV_VAR);
- if (booster_conf_path != NULL) {
- if (strlen (booster_conf_path) > 0)
- ret = booster_configure (booster_conf_path);
- else {
- gf_log ("booster", GF_LOG_ERROR, "%s defined but empty,"
- " using default path: %s", BOOSTER_CONF_ENV_VAR,
- DEFAULT_BOOSTER_CONF);
- ret = booster_configure (DEFAULT_BOOSTER_CONF);
- }
- } else {
- gf_log ("booster", GF_LOG_ERROR, "%s not defined, using default"
- " path: %s", BOOSTER_CONF_ENV_VAR,DEFAULT_BOOSTER_CONF);
- ret = booster_configure (DEFAULT_BOOSTER_CONF);
- }
-
- atexit (booster_cleanup);
- if (ret == 0)
- gf_log ("booster", GF_LOG_DEBUG, "booster is inited");
- return 0;
-
-err:
- /* Sure we return an error value here
- * but who cares about booster.
- */
- return -1;
-}
-
-
-static void
-booster_cleanup (void)
-{
- /* Ideally, we should be de-initing the fd-table here, but file
- * accesses through booster have been seen to continue while the
- * atexit-registered function runs. That means we cannot dealloc
- * the fd-table, since doing so could crash the check that decides
- * whether a given fd belongs to libc or to libglusterfsclient.
- * We should be satisfied with having cleaned up glusterfs contexts.
- */
- glusterfs_umount_all ();
- glusterfs_reset ();
-}
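booster_init and booster_cleanup above hang off two ordinary libc lifecycle hooks: the constructor attribute on booster_lib_init (declared near the top of this file) gets setup running before main(), and booster_init registers booster_cleanup with atexit (). A tiny stand-alone illustration of that pairing, with hypothetical names:

    #include <stdio.h>
    #include <stdlib.h>

    static void example_fini (void);

    /* Runs before the program's main() because of the constructor attribute. */
    static void __attribute__ ((constructor))
    example_init (void)
    {
            atexit (example_fini);          /* arrange matching teardown */
            fprintf (stderr, "library initialized\n");
    }

    static void
    example_fini (void)
    {
            fprintf (stderr, "library torn down\n");
    }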
-
-int
-fchmod (int fd, mode_t mode)
-{
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fchmod: fd %d, mode: 0x%x", fd, mode);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_fchmod == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_fchmod (fd, mode);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fchmod (fh, mode);
- booster_fdptr_put (fh);
- }
-
- return ret;
-}
-
-int
-fsync (int fd)
-{
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fsync: fd %d", fd);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_fsync == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_fsync (fd);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fsync (fh);
- booster_fdptr_put (fh);
- }
-
- return ret;
-}
-
-int __REDIRECT (booster_false_ftruncate, (int fd, off_t length),
- ftruncate);
-int __REDIRECT (booster_false_ftruncate64, (int fd, loff_t length),
- ftruncate64);
-
-int
-booster_false_ftruncate (int fd, off_t length)
-{
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "ftruncate: fd %d, length: %"PRIu64,fd
- , length);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_ftruncate == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_ftruncate (fd, length);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_ftruncate (fh, length);
- booster_fdptr_put (fh);
- }
-
- return ret;
-}
-
-int
-booster_false_ftruncate64 (int fd, loff_t length)
-{
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "ftruncate64: fd %d, length: %"PRIu64,fd
- , length);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_ftruncate64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_ftruncate64 (fd, length);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_ftruncate (fh, length);
- booster_fdptr_put (fh);
- }
-
- return ret;
-}
-
-int
-link (const char *old, const char *new)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "link: old: %s, new: %s", old, new);
- ret = glusterfs_link (old, new);
-
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "Link failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "link call succeeded");
- return ret;
- }
-
- if (real_link == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_link (old, new);
-
- return ret;
-}
-
-int
-rename (const char *old, const char *new)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "rename: old: %s, new: %s", old, new);
- ret = glusterfs_rename (old, new);
-
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "Rename failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "Rename succeeded");
- return ret;
- }
-
- if (real_rename == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_rename (old, new);
-
- return ret;
-}
-
-int
-utimes (const char *path, const struct timeval times[2])
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "utimes: path %s", path);
- ret = glusterfs_utimes (path, times);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "utimes failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "utimes succeeded");
- return ret;
- }
-
- if (real_utimes == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_utimes (path, times);
-
- return ret;
-}
-
-int
-utime (const char *path, const struct utimbuf *buf)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "utime: path %s", path);
- ret = glusterfs_utime (path, buf);
-
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "utime failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "utime succeeded");
- return ret;
- }
-
- if (real_utime == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_utime (path, buf);
-
- return ret;
-}
-
-int
-mknod (const char *path, mode_t mode, dev_t dev)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "mknod: path %s", path);
- ret = glusterfs_mknod (path, mode, dev);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "mknod failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "mknod succeeded");
- return ret;
- }
-
- if (real_mknod == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_mknod (path, mode, dev);
-
- return ret;
-}
-
-int
-mkfifo (const char *path, mode_t mode)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "mkfifo: path %s", path);
- ret = glusterfs_mkfifo (path, mode);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "mkfifo failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "mkfifo succeeded");
- return ret;
- }
-
- if (real_mkfifo == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_mkfifo (path, mode);
-
- return ret;
-}
-
-int
-unlink (const char *path)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "unlink: path %s", path);
- ret = glusterfs_unlink (path);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "unlink failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "unlink succeeded");
- return ret;
- }
-
- if (real_unlink == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_unlink (path);
-
- return ret;
-}
-
-int
-symlink (const char *oldpath, const char *newpath)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "symlink: old: %s, new: %s",
- oldpath, newpath);
- ret = glusterfs_symlink (oldpath, newpath);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "symlink failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "symlink succeeded");
- return ret;
- }
-
- if (real_symlink == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_symlink (oldpath, newpath);
-
- return ret;
-}
-
-int
-readlink (const char *path, char *buf, size_t bufsize)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "readlink: path %s", path);
- ret = glusterfs_readlink (path, buf, bufsize);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "readlink failed: %s",
- strerror (errno));
- return ret;
- }
-
- if (ret > 0) {
- gf_log ("booster", GF_LOG_TRACE, "readlink succeeded");
- return ret;
- }
-
- if (real_readlink == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_readlink (path, buf, bufsize);
-
- return ret;
-}
-
-char *
-realpath (const char *path, char *resolved_path)
-{
- char *res = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "realpath: path %s", path);
- res = glusterfs_realpath (path, resolved_path);
- if ((res == NULL) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "realpath failed: %s",
- strerror (errno));
- return res;
- }
-
- if (res != NULL) {
- gf_log ("booster", GF_LOG_TRACE, "realpath succeeded");
- return res;
- }
-
- if (real_realpath == NULL) {
- errno = ENOSYS;
- res = NULL;
- } else
- res = real_realpath (path, resolved_path);
-
- return res;
-}
-
-#define BOOSTER_GL_DIR 1
-#define BOOSTER_POSIX_DIR 2
-
-struct booster_dir_handle {
- int type;
- void *dirh;
-};
-
-DIR *
-opendir (const char *path)
-{
- glusterfs_dir_t gdir = NULL;
- struct booster_dir_handle *bh = NULL;
- DIR *pdir = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "opendir: path: %s", path);
- bh = calloc (1, sizeof (struct booster_dir_handle));
- if (!bh) {
- gf_log ("booster", GF_LOG_ERROR, "memory allocation failed");
- errno = ENOMEM;
- goto out;
- }
-
- gdir = glusterfs_opendir (path);
- if (gdir) {
- gf_log ("booster", GF_LOG_TRACE, "Gluster dir opened");
- bh->type = BOOSTER_GL_DIR;
- bh->dirh = (void *)gdir;
- goto out;
- } else if ((!gdir) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "Opendir failed");
- goto free_out;
- }
-
- if (real_opendir == NULL) {
- errno = ENOSYS;
- goto free_out;
- }
-
- pdir = real_opendir (path);
-
- if (pdir) {
- bh->type = BOOSTER_POSIX_DIR;
- bh->dirh = (void *)pdir;
- goto out;
- }
-
-free_out:
- if (bh) {
- free (bh);
- bh = NULL;
- }
-out:
- return (DIR *)bh;
-}
-
-int __REDIRECT (booster_false_readdir_r, (DIR *dir, struct dirent *entry,
- struct dirent **result), readdir_r) __nonnull ((1));
-int __REDIRECT (booster_false_readdir64_r, (DIR *dir, struct dirent64 *entry,
- struct dirent64 **result), readdir64_r) __nonnull ((1));
-
-int
-booster_false_readdir_r (DIR *dir, struct dirent *entry, struct dirent **result)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dir;
- int ret = 0;
-
- if (!bh) {
- ret = errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir_r on gluster");
- ret = glusterfs_readdir_r ((glusterfs_dir_t)bh->dirh, entry,
- result);
-
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir_r on posix");
- if (real_readdir_r == NULL) {
- ret = errno = ENOSYS;
- goto out;
- }
-
- ret = real_readdir_r ((DIR *)bh->dirh, entry, result);
- } else {
- ret = errno = EINVAL;
- }
-
-out:
- return ret;
-}
-
-int
-booster_false_readdir64_r (DIR *dir, struct dirent64 *entry,
- struct dirent64 **result)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dir;
- int ret = 0;
-
- if (!bh) {
- ret = errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir_r on gluster");
- ret = glusterfs_readdir_r ((glusterfs_dir_t)bh->dirh,
- (struct dirent *)entry,
- (struct dirent **)result);
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir_r on posix");
- if (real_readdir64_r == NULL) {
- ret = errno = ENOSYS;
- goto out;
- }
-
- ret = real_readdir64_r ((DIR *)bh->dirh, entry, result);
- } else {
- ret = errno = EINVAL;
- }
-
-out:
- return ret;
-}
-
-struct dirent *
-__REDIRECT (booster_false_readdir, (DIR *dir), readdir) __nonnull ((1));
-
-struct dirent64 *
-__REDIRECT (booster_false_readdir64, (DIR *dir), readdir64) __nonnull ((1));
-
-struct dirent *
-booster_false_readdir (DIR *dir)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dir;
- struct dirent *dirp = NULL;
-
- if (!bh) {
- errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir on gluster");
- dirp = glusterfs_readdir ((glusterfs_dir_t)bh->dirh);
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir on posix");
- if (real_readdir == NULL) {
- errno = ENOSYS;
- dirp = NULL;
- goto out;
- }
-
- dirp = real_readdir ((DIR *)bh->dirh);
- } else {
- dirp = NULL;
- errno = EINVAL;
- }
-
-out:
- return dirp;
-}
-
-struct dirent64 *
-booster_false_readdir64 (DIR *dir)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dir;
- struct dirent64 *dirp = NULL;
-
- if (!bh) {
- errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir on gluster");
- dirp = glusterfs_readdir ((glusterfs_dir_t)bh->dirh);
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "readdir on posix");
- if (real_readdir64 == NULL) {
- errno = ENOSYS;
- dirp = NULL;
- goto out;
- }
-
- dirp = real_readdir64 ((DIR *)bh->dirh);
- } else {
- dirp = NULL;
- errno = EINVAL;
- }
-
-out:
- return dirp;
-}
-
-int
-closedir (DIR *dh)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dh;
- int ret = -1;
-
- if (!bh) {
- errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "closedir on gluster");
- ret = glusterfs_closedir ((glusterfs_dir_t)bh->dirh);
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "closedir on posix");
- if (real_closedir == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else
- ret = real_closedir ((DIR *)bh->dirh);
- } else {
- errno = EBADF;
- }
-
- if (ret == 0) {
- free (bh);
- bh = NULL;
- }
-out:
- return ret;
-}
-
-/* The real stat functions reside in booster_stat.c to
- * prevent a clash with the stat* prototypes and functions
- * declared in sys/stat.h.
- */
-int
-booster_xstat (int ver, const char *path, void *buf)
-{
- struct stat *sbuf = (struct stat *)buf;
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "xstat: path: %s", path);
- ret = glusterfs_stat (path, sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "xstat failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "xstat succeeded");
- goto out;
- }
-
- if (real___xstat == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real___xstat (ver, path, sbuf);
-out:
- return ret;
-}
-
-int
-booster_xstat64 (int ver, const char *path, void *buf)
-{
- int ret = -1;
- struct stat64 *sbuf = (struct stat64 *)buf;
-
- gf_log ("booster", GF_LOG_TRACE, "xstat64: path: %s", path);
- ret = glusterfs_stat (path, (struct stat *)sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "xstat64 failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "xstat64 succeeded");
- goto out;
- }
-
- if (real___xstat64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
- ret = real___xstat64 (ver, path, sbuf);
-out:
- return ret;
-}
-
-int
-booster_stat (const char *path, void *buf)
-{
- struct stat *sbuf = (struct stat *)buf;
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "stat: path: %s", path);
- ret = glusterfs_stat (path, sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "stat failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "stat succeeded");
- goto out;
- }
-
- if (real_stat != NULL)
- ret = real_stat (path, sbuf);
- else if (real___xstat != NULL)
- ret = real___xstat (0, path, sbuf);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-
-out:
- return ret;
-}
-
-int
-booster_stat64 (const char *path, void *buf)
-{
- int ret = -1;
- struct stat64 *sbuf = (struct stat64 *)buf;
-
- gf_log ("booster", GF_LOG_TRACE, "stat64: %s", path);
- ret = glusterfs_stat (path, (struct stat *)sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "stat64 failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "stat64 succeeded");
- goto out;
- }
-
- if (real_stat64 != NULL)
- ret = real_stat64 (path, sbuf);
- else if (real___xstat64 != NULL)
- ret = real___xstat64 (0, path, sbuf);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-out:
- return ret;
-}
-
-int
-booster_fxstat (int ver, int fd, void *buf)
-{
- struct stat *sbuf = (struct stat *)buf;
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fxstat: fd %d", fd);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real___fxstat == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
- ret = real___fxstat (ver, fd, sbuf);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fstat (fh, sbuf);
- booster_fdptr_put (fh);
- }
-
-out:
- return ret;
-}
-
-int
-booster_fxstat64 (int ver, int fd, void *buf)
-{
- int ret = -1;
- struct stat64 *sbuf = (struct stat64 *)buf;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fxstat64: fd %d", fd);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real___fxstat64 == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
- ret = real___fxstat64 (ver, fd, sbuf);
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fstat (fh, (struct stat *)sbuf);
- booster_fdptr_put (fh);
- }
-
-out:
- return ret;
-}
-
-int
-booster_fstat (int fd, void *buf)
-{
- struct stat *sbuf = (struct stat *)buf;
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fstat: fd %d", fd);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_fstat != NULL)
- ret = real_fstat (fd, sbuf);
- else if (real___fxstat != NULL)
- ret = real___fxstat (0, fd, sbuf);
- else {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fstat (fh, sbuf);
- booster_fdptr_put (fh);
- }
-
-out:
- return ret;
-}
-
-int
-booster_fstat64 (int fd, void *buf)
-{
- int ret = -1;
- struct stat64 *sbuf = (struct stat64 *)buf;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fstat64: fd %d", fd);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_fstat64 != NULL)
- ret = real_fstat64 (fd, sbuf);
- else if (real___fxstat64 != NULL)
- /* Not sure how portable the use of 0 for the
- * version number is, but it works on glibc.
- * We need this because all of the above real*
- * function pointers have been observed to be
- * NULL; in that case, this is our last and only option.
- */
- ret = real___fxstat64 (0, fd, sbuf);
- else {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fstat (fh, (struct stat *)sbuf);
- booster_fdptr_put (fh);
- }
-
-out:
- return ret;
-}
-
-int
-booster_lxstat (int ver, const char *path, void *buf)
-{
- struct stat *sbuf = (struct stat *)buf;
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "lxstat: path %s", path);
- ret = glusterfs_lstat (path, sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "lxstat failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "lxstat succeeded");
- goto out;
- }
-
- if (real___lxstat == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real___lxstat (ver, path, sbuf);
-out:
- return ret;
-}
-
-int
-booster_lxstat64 (int ver, const char *path, void *buf)
-{
- int ret = -1;
- struct stat64 *sbuf = (struct stat64 *)buf;
-
- gf_log ("booster", GF_LOG_TRACE, "lxstat64: path %s", path);
- ret = glusterfs_lstat (path, (struct stat *)sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "lxstat64 failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "lxstat64 succeeded");
- goto out;
- }
-
- if (real___lxstat64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
- ret = real___lxstat64 (ver, path, sbuf);
-out:
- return ret;
-}
-
-int
-booster_lstat (const char *path, void *buf)
-{
- struct stat *sbuf = (struct stat *)buf;
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "lstat: path %s", path);
- ret = glusterfs_lstat (path, sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "lstat failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "lstat succeeded");
- goto out;
- }
-
- if (real_lstat != NULL)
- ret = real_lstat (path, sbuf);
- else if (real___lxstat != NULL)
- ret = real___lxstat (0, path, sbuf);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-
-out:
- return ret;
-}
-
-int
-booster_lstat64 (const char *path, void *buf)
-{
- int ret = -1;
- struct stat64 *sbuf = (struct stat64 *)buf;
-
- gf_log ("booster", GF_LOG_TRACE, "lstat64: path %s", path);
- ret = glusterfs_lstat (path, (struct stat *)sbuf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "lstat64 failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "lstat64 succeeded");
- goto out;
- }
-
- if (real_lstat64 != NULL)
- ret = real_lstat64 (path, sbuf);
- else if (real___lxstat64 != NULL)
- ret = real___lxstat64 (0, path, sbuf);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-out:
- return ret;
-}
-
-int
-booster_statfs (const char *pathname, struct statfs *buf)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "statfs: path %s", pathname);
- ret = glusterfs_statfs (pathname, buf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "statfs failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "statfs succeeded");
- goto out;
- }
-
- if (real_statfs == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real_statfs (pathname, buf);
-
-out:
- return ret;
-}
-
-int
-booster_statfs64 (const char *pathname, struct statfs64 *buf)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "statfs64: path %s", pathname);
- ret = glusterfs_statfs (pathname, (struct statfs *)buf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "statfs64 failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "statfs64 succeeded");
- goto out;
- }
-
- if (real_statfs64 == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real_statfs64 (pathname, buf);
-
-out:
- return ret;
-}
-
-int
-booster_statvfs (const char *pathname, struct statvfs *buf)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "statvfs: path %s", pathname);
- ret = glusterfs_statvfs (pathname, buf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "statvfs failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "statvfs succeeded");
- goto out;
- }
-
- if (real_statvfs == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real_statvfs (pathname, buf);
-
-out:
- return ret;
-}
-
-int
-booster_statvfs64 (const char *pathname, struct statvfs64 *buf)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "statvfs64: path %s", pathname);
- ret = glusterfs_statvfs (pathname, (struct statvfs *)buf);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "statvfs64 failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "statvfs64 succeeded");
- goto out;
- }
-
- if (real_statvfs64 == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real_statvfs64 (pathname, buf);
-
-out:
- return ret;
-}
-
-ssize_t
-getxattr (const char *path, const char *name, void *value, size_t size)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "getxattr: path %s, name %s", path,
- name);
- ret = glusterfs_getxattr (path, name, value, size);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "getxattr failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret > 0) {
- gf_log ("booster", GF_LOG_TRACE, "getxattr succeeded");
- return ret;
- }
-
- if (real_getxattr == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real_getxattr (path, name, value, size);
-out:
- return ret;
-}
-
-
-ssize_t
-lgetxattr (const char *path, const char *name, void *value, size_t size)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "lgetxattr: path %s, name %s", path,
- name);
- ret = glusterfs_lgetxattr (path, name, value, size);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "lgetxattr failed: %s",
- strerror (errno));
-
- goto out;
- }
-
- if (ret > 0) {
- gf_log ("booster", GF_LOG_TRACE, "lgetxattr succeeded");
- return ret;
- }
-
- if (real_lgetxattr == NULL) {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
-
- ret = real_lgetxattr (path, name, value, size);
-out:
- return ret;
-}
-
-int
-remove (const char *path)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "remove: %s", path);
- ret = glusterfs_remove (path);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "remove failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "remove succeeded");
- goto out;
- }
-
- if (real_remove == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
- ret = real_remove (path);
-
-out:
- return ret;
-}
-
-int
-lchown (const char *path, uid_t owner, gid_t group)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "lchown: path %s", path);
- ret = glusterfs_lchown (path, owner, group);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "lchown failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "lchown succeeded");
- goto out;
- }
-
- if (real_lchown == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
- ret = real_lchown (path, owner, group);
-
-out:
- return ret;
-}
-
-void
-booster_rewinddir (DIR *dir)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dir;
-
- if (!bh) {
- errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "rewinddir on glusterfs");
- glusterfs_rewinddir ((glusterfs_dir_t)bh->dirh);
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- if (real_rewinddir == NULL) {
- errno = ENOSYS;
- goto out;
- }
- gf_log ("booster", GF_LOG_TRACE, "rewinddir on posix");
- real_rewinddir ((DIR *)bh->dirh);
- } else
- errno = EINVAL;
-out:
- return;
-}
-
-void
-booster_seekdir (DIR *dir, off_t offset)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dir;
-
- if (!bh) {
- errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "seekdir on glusterfs");
- glusterfs_seekdir ((glusterfs_dir_t)bh->dirh, offset);
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- if (real_seekdir == NULL) {
- errno = ENOSYS;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "seekdir on posix");
- real_seekdir ((DIR *)bh->dirh, offset);
- } else
- errno = EINVAL;
-out:
- return;
-}
-
-off_t
-booster_telldir (DIR *dir)
-{
- struct booster_dir_handle *bh = (struct booster_dir_handle *)dir;
- off_t offset = -1;
-
- if (!bh) {
- errno = EFAULT;
- goto out;
- }
-
- if (bh->type == BOOSTER_GL_DIR) {
- gf_log ("booster", GF_LOG_TRACE, "telldir on glusterfs");
- offset = glusterfs_telldir ((glusterfs_dir_t)bh->dirh);
- } else if (bh->type == BOOSTER_POSIX_DIR) {
- if (real_telldir == NULL) {
- errno = ENOSYS;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "telldir on posix");
- offset = real_telldir ((DIR *)bh->dirh);
- } else
- errno = EINVAL;
-out:
- return offset;
-}
-
-
-pid_t
-fork (void)
-{
- pid_t pid = 0;
- char child = 0;
-
- glusterfs_log_lock ();
- {
- pid = real_fork ();
- }
- glusterfs_log_unlock ();
-
- child = (pid == 0);
- if (child) {
- booster_cleanup ();
- booster_init ();
- }
-
- return pid;
-}
-
-ssize_t
-sendfile (int out_fd, int in_fd, off_t *offset, size_t count)
-{
- glusterfs_file_t in_fh = NULL;
- ssize_t ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "sendfile: in fd %d, out fd %d, offset"
- " %"PRIu64", count %"GF_PRI_SIZET, in_fd, out_fd, *offset,
- count);
- /*
- * handle sendfile in booster only if in_fd corresponds to a glusterfs
- * file handle
- */
- in_fh = booster_fdptr_get (booster_fdtable, in_fd);
- if (!in_fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_sendfile == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else {
- ret = real_sendfile (out_fd, in_fd, offset, count);
- }
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_sendfile (out_fd, in_fh, offset, count);
- booster_fdptr_put (in_fh);
- }
-
- return ret;
-}
-
-ssize_t
-sendfile64 (int out_fd, int in_fd, off_t *offset, size_t count)
-{
- glusterfs_file_t in_fh = NULL;
- ssize_t ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "sendfile64: in fd %d, out fd %d,"
- " offset %"PRIu64", count %"GF_PRI_SIZET, in_fd, out_fd,
- *offset, count);
- /*
- * handle sendfile in booster only if in_fd corresponds to a glusterfs
- * file handle
- */
- in_fh = booster_fdptr_get (booster_fdtable, in_fd);
- if (!in_fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_sendfile64 == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else {
- ret = real_sendfile64 (out_fd, in_fd, offset, count);
- }
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_sendfile (out_fd, in_fh, offset, count);
- booster_fdptr_put (in_fh);
- }
-
- return ret;
-}
-
-
-int
-fcntl (int fd, int cmd, ...)
-{
- va_list ap;
- int ret = -1;
- long arg = 0;
- struct flock *lock = NULL;
- glusterfs_file_t glfs_fd = 0;
-
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
-
- gf_log ("booster", GF_LOG_TRACE, "fcntl: fd %d, cmd %d", fd, cmd);
- switch (cmd) {
- case F_DUPFD:
- ret = dup (fd);
- break;
- /*
- * FIXME: Consider this case when implementing F_DUPFD, F_GETFD,
- * etc. in libglusterfsclient. Commenting it out for the time
- * being since it is defined only in Linux kernel versions
- * >= 2.6.24.
- */
- /* case F_DUPFD_CLOEXEC: */
- case F_GETFD:
- if (glfs_fd != NULL) {
- ret = booster_get_close_on_exec (booster_fdtable, fd)
- ? FD_CLOEXEC : 0;
- } else {
- if (real_fcntl == NULL) {
- ret = -1;
- errno = ENOSYS;
- } else {
- ret = real_fcntl (fd, cmd);
- }
- }
- break;
-
- case F_GETFL:
- case F_GETOWN:
- case F_GETSIG:
- case F_GETLEASE:
- if (glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fcntl (glfs_fd, cmd);
- } else {
- if (!real_fcntl) {
- errno = ENOSYS;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- ret = real_fcntl (fd, cmd);
- }
- break;
-
- case F_SETFD:
- if (glfs_fd != NULL) {
- booster_set_close_on_exec (booster_fdtable, fd);
- ret = 0;
- } else {
- if (real_fcntl == NULL) {
- ret = -1;
- errno = ENOSYS;
- } else {
- ret = real_fcntl (fd, cmd);
- }
- }
- break;
-
- case F_SETFL:
- case F_SETOWN:
- case F_SETSIG:
- case F_SETLEASE:
- case F_NOTIFY:
- va_start (ap, cmd);
- arg = va_arg (ap, long);
- va_end (ap);
-
- if (glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fcntl (glfs_fd, cmd, arg);
- } else {
- if (!real_fcntl) {
- errno = ENOSYS;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- ret = real_fcntl (fd, cmd, arg);
- }
- break;
-
- case F_GETLK:
- case F_SETLK:
- case F_SETLKW:
-#if F_GETLK != F_GETLK64
- case F_GETLK64:
-#endif
-#if F_SETLK != F_SETLK64
- case F_SETLK64:
-#endif
-#if F_SETLKW != F_SETLKW64
- case F_SETLKW64:
-#endif
- va_start (ap, cmd);
- lock = va_arg (ap, struct flock *);
- va_end (ap);
-
- if (lock == NULL) {
- errno = EINVAL;
- goto out;
- }
-
- if (glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fcntl (glfs_fd, cmd, lock);
- } else {
- if (!real_fcntl) {
- errno = ENOSYS;
- goto out;
- }
-
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- ret = real_fcntl (fd, cmd, lock);
- }
- break;
-
- default:
- errno = EINVAL;
- break;
- }
-
-out:
- if (glfs_fd) {
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-int
-chdir (const char *path)
-{
- int ret = -1;
- char cwd[PATH_MAX];
- char *res = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "chdir: path %s", path);
-
- pthread_mutex_lock (&cwdlock);
- {
- res = glusterfs_getcwd (cwd, PATH_MAX);
- if (res == NULL) {
- gf_log ("booster", GF_LOG_ERROR, "getcwd failed: %s",
- strerror (errno));
- goto unlock;
- }
-
- ret = glusterfs_chdir (path);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "chdir failed: %s",
- strerror (errno));
- goto unlock;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "chdir succeeded");
- goto unlock;
- }
-
- if (real_chdir == NULL) {
- errno = ENOSYS;
- ret = -1;
- goto unlock;
- }
-
- ret = real_chdir (path);
- if (ret == -1) {
- glusterfs_chdir (cwd);
- }
- }
-unlock:
- pthread_mutex_unlock (&cwdlock);
-
- return ret;
-}
-
-
-int
-fchdir (int fd)
-{
- int ret = -1;
- glusterfs_file_t glfs_fd = 0;
- char cwd[PATH_MAX];
- char *res = NULL;
-
- glfs_fd = booster_fdptr_get (booster_fdtable, fd);
-
- if (!glfs_fd) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_fchdir == NULL) {
- errno = ENOSYS;
- ret = -1;
- } else {
- ret = real_fchdir (fd);
- if (ret == 0) {
- res = real_getcwd (cwd, PATH_MAX);
- if (res == NULL) {
- gf_log ("booster", GF_LOG_ERROR,
- "getcwd failed (%s)",
- strerror (errno));
- ret = -1;
- } else {
- glusterfs_chdir (cwd);
- }
- }
- }
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fchdir (glfs_fd);
- booster_fdptr_put (glfs_fd);
- }
-
- return ret;
-}
-
-
-char *
-getcwd (char *buf, size_t size)
-{
- char *res = NULL;
-
- res = glusterfs_getcwd (buf, size);
- if ((res == NULL) && (errno == ENODEV)) {
- res = real_getcwd (buf, size);
- }
-
- return res;
-}
-
-
-int __REDIRECT (booster_false_truncate, (const char *path, off_t length),
- truncate) __nonnull ((1));
-int __REDIRECT (booster_false_truncate64, (const char *path, loff_t length),
- truncate64) __nonnull ((1));
-
-int
-booster_false_truncate (const char *path, off_t length)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "truncate: path (%s) length (%"PRIu64
- ")", path, length);
-
- ret = glusterfs_truncate (path, length);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "truncate failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "truncate succeeded");
- goto out;
- }
-
- if (real_truncate != NULL)
- ret = real_truncate (path, length);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-out:
- return ret;
-}
-
-
-int
-booster_false_truncate64 (const char *path, loff_t length)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "truncate64: path (%s) length "
- "(%"PRIu64")", path, length);
-
- ret = glusterfs_truncate (path, length);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "truncate64 failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "truncate64 succeeded");
- goto out;
- }
-
- if (real_truncate64 != NULL)
- ret = real_truncate64 (path, length);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-out:
- return ret;
-}
-
-
-int
-setxattr (const char *path, const char *name, const void *value, size_t size,
- int flags)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "setxattr: path: %s", path);
- ret = glusterfs_setxattr (path, name, value, size, flags);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "setxattr failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "setxattr succeeded");
- goto out;
- }
-
- if (real_setxattr != NULL)
- ret = real_setxattr (path, name, value, size, flags);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-out:
- return ret;
-}
-
-
-int
-lsetxattr (const char *path, const char *name, const void *value, size_t size,
- int flags)
-{
- int ret = -1;
-
- gf_log ("booster", GF_LOG_TRACE, "lsetxattr: path: %s", path);
- ret = glusterfs_lsetxattr (path, name, value, size, flags);
- if ((ret == -1) && (errno != ENODEV)) {
- gf_log ("booster", GF_LOG_ERROR, "lsetxattr failed: %s",
- strerror (errno));
- goto out;
- }
-
- if (ret == 0) {
- gf_log ("booster", GF_LOG_TRACE, "lsetxattr succeeded");
- goto out;
- }
-
- if (real_lsetxattr != NULL)
- ret = real_lsetxattr (path, name, value, size, flags);
- else {
- errno = ENOSYS;
- ret = -1;
- goto out;
- }
-
-out:
- return ret;
-}
-
-
-int
-fsetxattr (int fd, const char *name, const void *value, size_t size, int flags)
-{
- int ret = -1;
- glusterfs_file_t fh = NULL;
-
- gf_log ("booster", GF_LOG_TRACE, "fsetxattr: fd %d", fd);
- fh = booster_fdptr_get (booster_fdtable, fd);
- if (!fh) {
- gf_log ("booster", GF_LOG_TRACE, "Not a booster fd");
- if (real_fsetxattr != NULL)
- ret = real_fsetxattr (fd, name, value, size, flags);
- else {
- ret = -1;
- errno = ENOSYS;
- goto out;
- }
- } else {
- gf_log ("booster", GF_LOG_TRACE, "Is a booster fd");
- ret = glusterfs_fsetxattr (fh, name, value, size, flags);
- booster_fdptr_put (fh);
- }
-
-out:
- return ret;
-}
-
-
-void
-booster_lib_init (void)
-{
-
- RESOLVE (open);
- RESOLVE (open64);
- RESOLVE (creat);
- RESOLVE (creat64);
-
- RESOLVE (read);
- RESOLVE (readv);
- RESOLVE (pread);
- RESOLVE (pread64);
-
- RESOLVE (write);
- RESOLVE (writev);
- RESOLVE (pwrite);
- RESOLVE (pwrite64);
-
- RESOLVE (lseek);
- RESOLVE (lseek64);
-
- RESOLVE (close);
-
- RESOLVE (dup);
- RESOLVE (dup2);
-
- RESOLVE (fork);
- RESOLVE (mkdir);
- RESOLVE (rmdir);
- RESOLVE (chmod);
- RESOLVE (chown);
- RESOLVE (fchmod);
- RESOLVE (fchown);
- RESOLVE (fsync);
- RESOLVE (ftruncate);
- RESOLVE (ftruncate64);
- RESOLVE (link);
- RESOLVE (rename);
- RESOLVE (utimes);
- RESOLVE (utime);
- RESOLVE (mknod);
- RESOLVE (mkfifo);
- RESOLVE (unlink);
- RESOLVE (symlink);
- RESOLVE (readlink);
- RESOLVE (realpath);
- RESOLVE (opendir);
- RESOLVE (readdir);
- RESOLVE (readdir64);
- RESOLVE (closedir);
- RESOLVE (__xstat);
- RESOLVE (__xstat64);
- RESOLVE (stat);
- RESOLVE (stat64);
- RESOLVE (__fxstat);
- RESOLVE (__fxstat64);
- RESOLVE (fstat);
- RESOLVE (fstat64);
- RESOLVE (__lxstat);
- RESOLVE (__lxstat64);
- RESOLVE (lstat);
- RESOLVE (lstat64);
- RESOLVE (statfs);
- RESOLVE (statfs64);
- RESOLVE (statvfs);
- RESOLVE (statvfs64);
- RESOLVE (getxattr);
- RESOLVE (lgetxattr);
- RESOLVE (remove);
- RESOLVE (lchown);
- RESOLVE (rewinddir);
- RESOLVE (seekdir);
- RESOLVE (telldir);
- RESOLVE (sendfile);
- RESOLVE (sendfile64);
- RESOLVE (readdir_r);
- RESOLVE (readdir64_r);
- RESOLVE (fcntl);
- RESOLVE (chdir);
- RESOLVE (fchdir);
- RESOLVE (getcwd);
- RESOLVE (truncate);
- RESOLVE (truncate64);
- RESOLVE (setxattr);
- RESOLVE (lsetxattr);
- RESOLVE (fsetxattr);
-
- /* This must be called after resolving real functions
- * above so that the socket-based I/O calls in libglusterfsclient
- * can fall back to a non-NULL real_XXX function pointer.
- * Calling booster_init before resolving the names above
- * results in seg-faults because the function symbols above are NULL.
- */
- booster_init ();
-}
-
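
The comment at the end of booster_lib_init() hinges on the RESOLVE() calls filling in the real_XXX pointers before booster_init() runs. The macro itself is defined elsewhere in the deleted tree, so the following is only an editorial sketch, assuming the conventional dlsym(RTLD_NEXT, ...) technique, of how one such libc symbol is captured by an LD_PRELOAD interposer; the name real_write_fn and the build hints are illustrative, not GlusterFS APIs.

/* Editorial sketch (not from the deleted sources): the dlsym-based
 * resolution pattern that the RESOLVE() calls above rely on.  Build as
 * a shared object, link with -ldl, and load it with LD_PRELOAD. */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <errno.h>
#include <sys/types.h>
#include <unistd.h>

static ssize_t (*real_write_fn) (int fd, const void *buf, size_t count);

ssize_t
write (int fd, const void *buf, size_t count)
{
        if (real_write_fn == NULL) {
                /* RTLD_NEXT returns the next definition of "write" in the
                 * search order, i.e. the libc one shadowed by this wrapper. */
                real_write_fn = (ssize_t (*)(int, const void *, size_t))
                                dlsym (RTLD_NEXT, "write");
        }

        if (real_write_fn == NULL) {
                /* Same fallback convention as the wrappers in the file above. */
                errno = ENOSYS;
                return -1;
        }

        return real_write_fn (fd, buf, count);
}
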
diff --git a/booster/src/booster_fstab.c b/booster/src/booster_fstab.c
deleted file mode 100644
index 202249cad..000000000
--- a/booster/src/booster_fstab.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/* Utilities for reading/writing fstab, mtab, etc.
- Copyright (C) 1995-2000, 2001, 2002, 2003, 2006
- Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-#include <alloca.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include "booster_fstab.h"
-#include <stdlib.h>
-#include <libglusterfsclient.h>
-#include <errno.h>
-
-/* The default timeout for inode and stat cache. */
-#define BOOSTER_DEFAULT_ATTR_TIMEO 5 /* In Secs */
-
-/* Prepare to begin reading and/or writing mount table entries from the
- beginning of FILE. MODE is as for `fopen'. */
-glusterfs_fstab_t *
-glusterfs_fstab_init (const char *file, const char *mode)
-{
- glusterfs_fstab_t *handle = NULL;
- handle = calloc (1, sizeof (glusterfs_fstab_t));
- if (!handle) {
- gf_log ("booster-fstab", GF_LOG_ERROR, "Memory allocation"
- " failed");
- goto out;
- }
-
- gf_log ("booster-fstab", GF_LOG_DEBUG, "FSTAB file: %s", file);
- FILE *result = fopen (file, mode);
- if (result != NULL) {
- handle->fp = result;
- } else {
- gf_log ("booster-fstab", GF_LOG_ERROR, "FSTAB file open failed:"
- " %s", strerror (errno));
- free (handle);
- handle = NULL;
- }
-
-out:
-
- return handle;
-}
-
-int
-glusterfs_fstab_close (glusterfs_fstab_t *h)
-{
- if (!h)
- return -1;
-
- if (h->fp)
- fclose (h->fp);
-
- return 0;
-}
-
-/* Since the values in a line are separated by spaces, a name cannot
- contain a space. Therefore some programs encode spaces in names
- by the strings "\040". We undo the encoding when reading an entry.
- The decoding happens in place. */
-static char *
-decode_name (char *buf)
-{
- char *rp = buf;
- char *wp = buf;
-
- do
- if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '4'
- && rp[3] == '0')
- {
- /* \040 is a SPACE. */
- *wp++ = ' ';
- rp += 3;
- }
- else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1'
- && rp[3] == '1')
- {
- /* \011 is a TAB. */
- *wp++ = '\t';
- rp += 3;
- }
- else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1'
- && rp[3] == '2')
- {
- /* \012 is a NEWLINE. */
- *wp++ = '\n';
- rp += 3;
- }
- else if (rp[0] == '\\' && rp[1] == '\\')
- {
- /* We have to escape \\ to be able to represent all
- * characters. */
- *wp++ = '\\';
- rp += 1;
- }
- else if (rp[0] == '\\' && rp[1] == '1' && rp[2] == '3'
- && rp[3] == '4')
- {
- /* \134 is also \\. */
- *wp++ = '\\';
- rp += 3;
- }
- else
- *wp++ = *rp;
- while (*rp++ != '\0');
-
- return buf;
-}
-
-
-/* Read one mount table entry from STREAM. Returns a pointer to storage
- reused on the next call, or null for EOF or error (use feof/ferror to
- check). */
-struct glusterfs_mntent *
-__glusterfs_fstab_getent (FILE *stream, struct glusterfs_mntent *mp,
- char *buffer, int bufsiz)
-{
- char *cp;
- char *head;
-
- do
- {
- char *end_ptr;
-
- if (fgets (buffer, bufsiz, stream) == NULL)
- {
- return NULL;
- }
-
- end_ptr = strchr (buffer, '\n');
- if (end_ptr != NULL) /* chop newline */
- *end_ptr = '\0';
- else
- {
- /* The whole line was not read; consume the rest now
- * but discard it. */
- char tmp[1024];
- while (fgets (tmp, sizeof tmp, stream) != NULL)
- if (strchr (tmp, '\n') != NULL)
- break;
- }
-
- head = buffer + strspn (buffer, " \t");
- /* skip empty lines and comment lines: */
- }
- while (head[0] == '\0' || head[0] == '#');
-
- cp = strsep (&head, " \t");
- mp->mnt_fsname = cp != NULL ? decode_name (cp) : (char *) "";
- if (head)
- head += strspn (head, " \t");
- cp = strsep (&head, " \t");
- mp->mnt_dir = cp != NULL ? decode_name (cp) : (char *) "";
- if (head)
- head += strspn (head, " \t");
- cp = strsep (&head, " \t");
- mp->mnt_type = cp != NULL ? decode_name (cp) : (char *) "";
- if (head)
- head += strspn (head, " \t");
- cp = strsep (&head, " \t");
- mp->mnt_opts = cp != NULL ? decode_name (cp) : (char *) "";
- switch (head ? sscanf (head, " %d %d ", &mp->mnt_freq,
- &mp->mnt_passno) : 0)
- {
- case 0:
- mp->mnt_freq = 0;
- case 1:
- mp->mnt_passno = 0;
- case 2:
- break;
- }
-
- return mp;
-}
-
-struct glusterfs_mntent *
-glusterfs_fstab_getent (glusterfs_fstab_t *h)
-{
- if (!h)
- return NULL;
-
- if (!h->fp)
- return NULL;
-
- return __glusterfs_fstab_getent (h->fp, &h->tmpent, h->buf,
- GF_MNTENT_BUFSIZE);
-}
-
-/* We have to use an encoding for names if they contain spaces or tabs.
- To be able to represent all characters we also have to escape the
- backslash itself. This "function" must be a macro since we use
- `alloca'. */
-#define encode_name(name) \
- do { \
- const char *rp = name; \
- \
- while (*rp != '\0') \
- if (*rp == ' ' || *rp == '\t' || *rp == '\\') \
- break; \
- else \
- ++rp; \
- \
- if (*rp != '\0') \
- { \
- /* In the worst case the length of the string \
- * can increase to four times the current \
- * length. */ \
- char *wp; \
- \
- rp = name; \
- name = wp = (char *) alloca (strlen (name) * 4 + 1); \
- \
- do { \
- if (*rp == ' ') \
- { \
- *wp++ = '\\'; \
- *wp++ = '0'; \
- *wp++ = '4'; \
- *wp++ = '0'; \
- } \
- else if (*rp == '\t') \
- { \
- *wp++ = '\\'; \
- *wp++ = '0'; \
- *wp++ = '1'; \
- *wp++ = '1'; \
- } \
- else if (*rp == '\n') \
- { \
- *wp++ = '\\'; \
- *wp++ = '0'; \
- *wp++ = '1'; \
- *wp++ = '2'; \
- } \
- else if (*rp == '\\') \
- { \
- *wp++ = '\\'; \
- *wp++ = '\\'; \
- } \
- else \
- *wp++ = *rp; \
- } while (*rp++ != '\0'); \
- } \
- } while (0) \
-
-
-int
-glusterfs_fstab_addent (glusterfs_fstab_t *h,
- const struct glusterfs_mntent *mnt)
-{
- struct glusterfs_mntent mntcopy = *mnt;
- if (!h)
- return -1;
-
- if (!h->fp)
- return -1;
-
- if (fseek (h->fp, 0, SEEK_END))
- return -1;
-
- /* Encode spaces and tabs in the names. */
- encode_name (mntcopy.mnt_fsname);
- encode_name (mntcopy.mnt_dir);
- encode_name (mntcopy.mnt_type);
- encode_name (mntcopy.mnt_opts);
-
- return (fprintf (h->fp, "%s %s %s %s %d %d\n",
- mntcopy.mnt_fsname,
- mntcopy.mnt_dir,
- mntcopy.mnt_type,
- mntcopy.mnt_opts,
- mntcopy.mnt_freq,
- mntcopy.mnt_passno)
- < 0 ? 1 : 0);
-}
-
-
-/* Search MNT->mnt_opts for an option matching OPT.
- Returns the address of the substring, or null if none found. */
-char *
-glusterfs_fstab_hasoption (const struct glusterfs_mntent *mnt, const char *opt)
-{
- const size_t optlen = strlen (opt);
- char *rest = mnt->mnt_opts, *p;
-
- while ((p = strstr (rest, opt)) != NULL)
- {
- if ((p == rest || p[-1] == ',')
- && (p[optlen] == '\0' || p[optlen] == '=' || p[optlen] == ','))
- return p;
-
- rest = strchr (p, ',');
- if (rest == NULL)
- break;
- ++rest;
- }
-
- return NULL;
-}
-
-void
-clean_init_params (glusterfs_init_params_t *ipars)
-{
- if (!ipars)
- return;
-
- if (ipars->volume_name)
- free (ipars->volume_name);
-
- if (ipars->specfile)
- free (ipars->specfile);
-
- if (ipars->logfile)
- free (ipars->logfile);
-
- if (ipars->loglevel)
- free (ipars->loglevel);
-
- return;
-}
-
-char *
-get_option_value (char *opt)
-{
- char *val = NULL;
- char *saveptr = NULL;
- char *copy_opt = NULL;
- char *retval = NULL;
-
- copy_opt = strdup (opt);
-
- /* Get the = before the value of the option. */
- val = index (copy_opt, '=');
- if (val) {
- /* Move past '=' to the start of the value */
- ++val;
-
- /* Now, to create a '\0'-delimited string out of the
- * options string, find where the next option starts,
- * i.e. the next ','.
- */
- saveptr = index (val, ',');
- if (saveptr)
- *saveptr = '\0';
- retval = strdup (val);
- }
-
- free (copy_opt);
-
- return retval;
-}
-
-void
-booster_mount (struct glusterfs_mntent *ent)
-{
- char *opt = NULL;
- glusterfs_init_params_t ipars;
- time_t timeout = BOOSTER_DEFAULT_ATTR_TIMEO;
- char *timeostr = NULL;
- char *endptr = NULL;
-
- if (!ent)
- return;
-
- gf_log ("booster-fstab", GF_LOG_DEBUG, "Mount entry: volfile: %s,"
- " VMP: %s, Type: %s, Options: %s", ent->mnt_fsname,
- ent->mnt_dir, ent->mnt_type, ent->mnt_opts);
- if ((strcmp (ent->mnt_type, "glusterfs") != 0)) {
- gf_log ("booster-fstab", GF_LOG_ERROR, "Type is not glusterfs");
- return;
- }
-
- memset (&ipars, 0, sizeof (glusterfs_init_params_t));
- if (ent->mnt_fsname)
- ipars.specfile = strdup (ent->mnt_fsname);
-
- opt = glusterfs_fstab_hasoption (ent, "subvolume");
- if (opt)
- ipars.volume_name = get_option_value (opt);
-
- opt = glusterfs_fstab_hasoption (ent, "log-file");
- if (!opt)
- opt = glusterfs_fstab_hasoption (ent, "logfile");
-
- if (opt)
- ipars.logfile = get_option_value (opt);
-
- opt = glusterfs_fstab_hasoption (ent, "log-level");
- if (!opt)
- opt = glusterfs_fstab_hasoption (ent, "loglevel");
-
- if (opt)
- ipars.loglevel = get_option_value (opt);
-
- /* Attribute cache timeout */
- opt = glusterfs_fstab_hasoption (ent, "attr_timeout");
- if (opt) {
- timeostr = get_option_value (opt);
- if (timeostr)
- timeout = strtol (timeostr, &endptr, 10);
- }
-
- ipars.lookup_timeout = timeout;
- ipars.stat_timeout = timeout;
-
- if ((glusterfs_mount (ent->mnt_dir, &ipars)) == -1)
- gf_log ("booster-fstab", GF_LOG_ERROR, "VMP mounting failed");
-
- clean_init_params (&ipars);
-}
-
-int
-booster_configure (char *confpath)
-{
- int ret = -1;
- glusterfs_fstab_t *handle = NULL;
- struct glusterfs_mntent *ent = NULL;
-
- if (!confpath)
- goto out;
-
- handle = glusterfs_fstab_init (confpath, "r");
- if (!handle)
- goto out;
-
- while ((ent = glusterfs_fstab_getent (handle)) != NULL)
- booster_mount (ent);
-
- glusterfs_fstab_close (handle);
- ret = 0;
-out:
- return ret;
-}
-
-
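
booster_configure() above is the only in-tree consumer of this parser. As a standalone illustration (main(), the sample path and the printed format are invented here; only the types and functions come from the deleted booster_fstab.h), reading such a file looks roughly like this:

/* Editorial sketch: enumerate the entries of a booster fstab-style file
 * with the deleted API.  The path below is only a placeholder. */
#include <stdio.h>
#include "booster_fstab.h"

int
main (void)
{
        glusterfs_fstab_t       *h   = NULL;
        struct glusterfs_mntent *ent = NULL;

        h = glusterfs_fstab_init ("/tmp/booster.fstab", "r");
        if (!h)
                return 1;

        /* Each entry carries the volfile (mnt_fsname), the virtual mount
         * point (mnt_dir), the type and the comma-separated options. */
        while ((ent = glusterfs_fstab_getent (h)) != NULL)
                printf ("%s on %s type %s (%s)\n", ent->mnt_fsname,
                        ent->mnt_dir, ent->mnt_type, ent->mnt_opts);

        glusterfs_fstab_close (h);
        return 0;
}
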
diff --git a/booster/src/booster_fstab.h b/booster/src/booster_fstab.h
deleted file mode 100644
index 9bab04c5a..000000000
--- a/booster/src/booster_fstab.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Utilities for reading/writing fstab, mtab, etc.
- Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-#ifndef GLUSTERFS_FSTAB_MNTENT_H
-#define GLUSTERFS_FSTAB_MNTENT_H 1
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "compat.h"
-
-/* General filesystem types. */
-#define GF_MNTTYPE_IGNORE "ignore" /* Ignore this entry. */
-#define GF_MNTTYPE_NFS "nfs" /* Network file system. */
-#define GF_MNTTYPE_SWAP "swap" /* Swap device. */
-
-
-/* Generic mount options. */
-#define GF_MNTOPT_DEFAULTS "defaults" /* Use all default options. */
-#define GF_MNTOPT_RO "ro" /* Read only. */
-#define GF_MNTOPT_RW "rw" /* Read/write. */
-#define GF_MNTOPT_SUID "suid" /* Set uid allowed. */
-#define GF_MNTOPT_NOSUID "nosuid" /* No set uid allowed. */
-#define GF_MNTOPT_NOAUTO "noauto" /* Do not auto mount. */
-
-
-/* Structure describing a mount table entry. */
-struct glusterfs_mntent
-{
- char *mnt_fsname; /* Device or server for filesystem. */
- char *mnt_dir; /* Directory mounted on. */
- char *mnt_type; /* Type of filesystem: ufs, nfs, etc. */
- char *mnt_opts; /* Comma-separated options for fs. */
- int mnt_freq; /* Dump frequency (in days). */
- int mnt_passno; /* Pass number for `fsck'. */
-};
-
-#define GF_MNTENT_BUFSIZE 1024
-typedef struct glusterfs_fstab_handle {
- FILE *fp;
- char buf[GF_MNTENT_BUFSIZE];
- struct glusterfs_mntent tmpent;
-}glusterfs_fstab_t;
-
-
-/* Prepare to begin reading and/or writing mount table entries from the
- beginning of FILE. MODE is as for `fopen'. */
-extern glusterfs_fstab_t *glusterfs_fstab_init (const char *file,
- const char *mode);
-
-extern struct glusterfs_mntent *glusterfs_fstab_getent (glusterfs_fstab_t *h);
-
-/* Write the mount table entry described by MNT to STREAM.
- Return zero on success, nonzero on failure. */
-extern int glusterfs_fstab_addent (glusterfs_fstab_t *h,
- const struct glusterfs_mntent *mnt);
-
-/* Close a stream opened with `glusterfs_fstab_init'. */
-extern int glusterfs_fstab_close (glusterfs_fstab_t *h);
-
-/* Search MNT->mnt_opts for an option matching OPT.
- Returns the address of the substring, or null if none found. */
-extern char *glusterfs_fstab_hasoption (const struct glusterfs_mntent *mnt,
- const char *opt);
-
-#endif
diff --git a/booster/src/booster_stat.c b/booster/src/booster_stat.c
deleted file mode 100644
index 8f76cfe37..000000000
--- a/booster/src/booster_stat.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <sys/types.h>
-
-extern int
-booster_stat (const char *path, void *buf);
-
-extern int
-booster_stat64 (const char *path, void *buf);
-
-extern int
-booster_xstat (int ver, const char *path, void *buf);
-
-extern int
-booster_xstat64 (int ver, const char *path, void *buf);
-
-extern int
-booster_fxstat (int ver, int fd, void *buf);
-extern int
-booster_fxstat64 (int ver, int fd, void *buf);
-extern int
-booster_fstat (int fd, void *buf);
-extern int
-booster_fstat64 (int fd, void *buf);
-
-extern int
-booster_lstat (const char *path, void *buf);
-extern int
-booster_lstat64 (const char *path, void *buf);
-extern int
-booster_lxstat (int ver, const char *path, void *buf);
-extern int
-booster_lxstat64 (int ver, const char *path, void *buf);
-
-
-extern int
-booster_statfs (const char *path, void *buf);
-extern int
-booster_statfs64 (const char *path, void *buf);
-
-extern int
-booster_statvfs (const char *path, void *buf);
-
-extern int
-booster_statvfs64 (const char *path, void *buf);
-
-extern void *
-booster_readdir (void *dir);
-
-extern void
-booster_rewinddir (void *dir);
-
-extern void
-booster_seekdir (void *dir, off_t offset);
-
-extern off_t
-booster_telldir (void *dir);
-
-int
-stat (const char *path, void *buf)
-{
- return booster_stat (path, buf);
-}
-
-int
-stat64 (const char *path, void *buf)
-{
- return booster_stat64 (path, buf);
-}
-
-int
-__xstat (int ver, const char *path, void *buf)
-{
- return booster_xstat (ver, path, buf);
-}
-
-int
-__xstat64 (int ver, const char *path, void *buf)
-{
- return booster_xstat64 (ver, path, buf);
-}
-
-int
-__fxstat (int ver, int fd, void *buf)
-{
- return booster_fxstat (ver, fd, buf);
-}
-
-int
-__fxstat64 (int ver, int fd, void *buf)
-{
- return booster_fxstat64 (ver, fd, buf);
-}
-
-int
-fstat (int fd, void *buf)
-{
- return booster_fstat (fd, buf);
-}
-
-int
-fstat64 (int fd, void *buf)
-{
- return booster_fstat64 (fd, buf);
-}
-
-int
-lstat (const char *path, void *buf)
-{
- return booster_lstat (path, buf);
-}
-
-int
-lstat64 (const char *path, void *buf)
-{
- return booster_lstat64 (path, buf);
-}
-
-int
-__lxstat (int ver, const char *path, void *buf)
-{
- return booster_lxstat (ver, path, buf);
-}
-
-int
-__lxstat64 (int ver, const char *path, void *buf)
-{
- return booster_lxstat64 (ver, path, buf);
-}
-
-int
-statfs (const char *pathname, void *buf)
-{
- return booster_statfs (pathname, buf);
-}
-
-int
-statfs64 (const char *pathname, void *buf)
-{
- return booster_statfs64 (pathname, buf);
-}
-
-int
-statvfs (const char *pathname, void *buf)
-{
- return booster_statvfs (pathname, buf);
-}
-
-int
-statvfs64 (const char *pathname, void *buf)
-{
- return booster_statvfs64 (pathname, buf);
-}
-
-void
-rewinddir (void *dir)
-{
- return booster_rewinddir (dir);
-}
-
-void
-seekdir (void *dir, off_t offset)
-{
- return booster_seekdir (dir, offset);
-}
-
-off_t
-telldir (void *dir)
-{
- return booster_telldir (dir);
-}
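
booster_stat.c keeps thin aliases for both the plain stat family and the glibc __xstat/__fxstat/__lxstat entry points. That duplication matters because, on the glibc versions this code targeted, a user-level stat() call is typically compiled as an inline wrapper that ends up in __xstat(), so an interposing library has to export both names. The caller-side program below is an editorial illustration, not part of the tree.

/* Editorial sketch: why both stat() and __xstat() are interposed.
 * With older glibc headers, this stat() call is lowered to
 * __xstat(_STAT_VER, path, &sb), so a preloaded library must cover
 * both symbols to catch every caller. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

int
main (void)
{
        struct stat sb;

        if (stat ("/etc/hostname", &sb) == 0)
                printf ("size: %lld bytes\n", (long long) sb.st_size);

        return 0;
}
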
diff --git a/cli/src/Makefile.am b/cli/src/Makefile.am
index b76a9efd3..216d1bb55 100644
--- a/cli/src/Makefile.am
+++ b/cli/src/Makefile.am
@@ -2,23 +2,26 @@ sbin_PROGRAMS = gluster
gluster_SOURCES = cli.c registry.c input.c cli-cmd.c cli-rl.c \
cli-cmd-volume.c cli-cmd-peer.c cli-rpc-ops.c cli-cmd-parser.c\
- cli-cmd-system.c cli-cmd-misc.c
+ cli-cmd-system.c cli-cmd-misc.c cli-xml-output.c cli-cmd-snapshot.c
gluster_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GF_LDADD)\
$(RLLIBS) $(top_builddir)/rpc/xdr/src/libgfxdr.la \
- $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la
+ $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+ $(GF_GLUSTERFS_LIBS) $(XML_LIBS)
-gluster_LDFLAGS = $(GF_LDFLAGS) $(GF_GLUSTERFS_LDFLAGS)
+gluster_LDFLAGS = $(GF_LDFLAGS)
noinst_HEADERS = cli.h cli-mem-types.h cli-cmd.h
-AM_CFLAGS = -fPIC -Wall -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS)\
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/rpc/rpc-lib/src\
-I$(top_srcdir)/rpc/xdr/src\
-DDATADIR=\"$(localstatedir)\" \
- -DCONFDIR=\"$(sysconfdir)/glusterfs\" $(GF_GLUSTERFS_CFLAGS)\
+ -DCONFDIR=\"$(sysconfdir)/glusterfs\" \
-DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\
- -DSYNCDAEMON_COMPILE=$(SYNCDAEMON_COMPILE) -DSBIN_DIR=\"$(sbindir)\"
+ -DSYNCDAEMON_COMPILE=$(SYNCDAEMON_COMPILE) -DSBIN_DIR=\"$(sbindir)\"\
+ $(XML_CPPFLAGS)
+AM_CFLAGS = -Wall $(GF_GLUSTERFS_CFLAGS)
CLEANFILES =
diff --git a/cli/src/cli-cmd-misc.c b/cli/src/cli-cmd-misc.c
index 40f419cde..566d7c978 100644
--- a/cli/src/cli-cmd-misc.c
+++ b/cli/src/cli-cmd-misc.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -41,6 +31,8 @@ extern struct cli_cmd volume_cmds[];
extern struct cli_cmd cli_probe_cmds[];
extern struct cli_cmd cli_log_cmds[];
extern struct cli_cmd cli_system_cmds[];
+extern struct cli_cmd cli_bd_cmds[];
+extern struct cli_cmd snapshot_cmds[];
struct cli_cmd cli_misc_cmds[];
int
@@ -55,7 +47,8 @@ cli_cmd_display_help (struct cli_state *state, struct cli_cmd_word *in_word,
const char **words, int wordcount)
{
struct cli_cmd *cmd[] = {volume_cmds, cli_probe_cmds,
- cli_misc_cmds, NULL};
+ cli_misc_cmds, snapshot_cmds,
+ NULL};
struct cli_cmd *cmd_ind = NULL;
int i = 0;
@@ -80,6 +73,10 @@ struct cli_cmd cli_misc_cmds[] = {
cli_cmd_display_help,
"display command options"},
+ { "exit",
+ cli_cmd_quit_cbk,
+ "exit"},
+
{ NULL, NULL, NULL }
};
diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c
index d3cb1240f..5ab208b8f 100644
--- a/cli/src/cli-cmd-parser.c
+++ b/cli/src/cli-cmd-parser.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -36,6 +26,37 @@
#include "protocol-common.h"
#include "cli1-xdr.h"
+#define MAX_SNAP_DESCRIPTION_LEN 1024
+
+struct snap_config_opt_vals_ snap_confopt_vals[] = {
+ {.op_name = "snap-max-hard-limit",
+ .question = "Changing snapshot-max-hard-limit "
+ "will lead to deletion of snapshots "
+ "if they exceed the new limit.\n"
+ "Do you want to continue?"
+ },
+ {.op_name = "snap-max-soft-limit",
+ .question = "Changing snapshot-max-soft-limit "
+ "will lead to deletion of snapshots "
+ "if they exceed the new limit.\n"
+ "Do you want to continue?"
+ },
+ {.op_name = "both",
+ .question = "Changing snapshot-max-hard-limit & "
+ "snapshot-max-soft-limit will lead to "
+ "deletion of snapshots if they exceed "
+ "the new limit.\nDo you want to continue?"
+ },
+ {.op_name = NULL,
+ }
+};
+
+enum cli_snap_config_set_types {
+ GF_SNAP_CONFIG_SET_HARD = 0,
+ GF_SNAP_CONFIG_SET_SOFT = 1,
+ GF_SNAP_CONFIG_SET_BOTH = 2,
+};
+typedef enum cli_snap_config_set_types cli_snap_config_set_types;
static const char *
id_sel (void *wcon)
@@ -75,21 +96,20 @@ cli_cmd_bricks_parse (const char **words, int wordcount, int brick_index,
brick_list_len++;
while (brick_index < wordcount) {
if (validate_brick_name ((char *)words[brick_index])) {
- cli_out ("Wrong brick type: %s, use <HOSTNAME>:"
+ cli_err ("Wrong brick type: %s, use <HOSTNAME>:"
"<export-dir-abs-path>", words[brick_index]);
ret = -1;
goto out;
} else {
delimiter = strrchr (words[brick_index], ':');
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
if ((brick_list_len + strlen (words[brick_index]) + 1) > sizeof (brick_list)) {
- gf_log ("cli", GF_LOG_ERROR,
- "total brick list is larger than a request "
- "can take (brick_count %d)", *brick_count);
+ cli_err ("Total brick list is larger than a request. "
+ "Can take (brick_count %d)", *brick_count);
ret = -1;
goto out;
}
@@ -109,16 +129,18 @@ cli_cmd_bricks_parse (const char **words, int wordcount, int brick_index,
}
if (!(strcmp (host_name, "localhost") &&
- strcmp (host_name, "127.0.0.1"))) {
- cli_out ("Please provide a valid hostname/ip other "
- "than localhost or 127.0.0.1");
+ strcmp (host_name, "127.0.0.1") &&
+ strncmp (host_name, "0.", 2))) {
+ cli_err ("Please provide a valid hostname/ip other "
+ "than localhost, 127.0.0.1 or loopback "
+ "address (0.0.0.0 to 0.255.255.255).");
ret = -1;
GF_FREE (tmp_host);
goto out;
}
- if (!valid_internet_address (host_name)) {
- cli_out ("internet address '%s' does not comform to "
- "standards", host_name);
+ if (!valid_internet_address (host_name, _gf_false)) {
+ cli_err ("internet address '%s' does not conform to "
+ "standards", host_name);
}
GF_FREE (tmp_host);
tmp_list = gf_strdup (brick_list + 1);
@@ -132,7 +154,7 @@ cli_cmd_bricks_parse (const char **words, int wordcount, int brick_index,
strtok_r (tmp_list, " ", &tmpptr);
if (!(strcmp (tmp_list, words[brick_index]))) {
ret = -1;
- cli_out ("Found duplicate"
+ cli_err ("Found duplicate"
" exports %s",words[brick_index]);
goto out;
}
@@ -150,8 +172,7 @@ cli_cmd_bricks_parse (const char **words, int wordcount, int brick_index,
if (!*bricks)
ret = -1;
out:
- if (free_list_ptr)
- GF_FREE (free_list_ptr);
+ GF_FREE (free_list_ptr);
return ret;
}
@@ -171,10 +192,18 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
char *bricks = NULL;
int32_t brick_count = 0;
char *opwords[] = { "replica", "stripe", "transport", NULL };
+
+ char *invalid_volnames[] = {"volume", "type", "subvolumes", "option",
+ "end-volume", "all", "volume_not_in_ring",
+ "description", "force",
+ "snap-max-hard-limit",
+ "snap-max-soft-limit", NULL};
char *w = NULL;
int op_count = 0;
int32_t replica_count = 1;
int32_t stripe_count = 1;
+ gf_boolean_t is_force = _gf_false;
+ int wc = wordcount;
GF_ASSERT (words);
GF_ASSERT (options);
@@ -196,9 +225,12 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
if (volname[0] == '-')
goto out;
- if (!strcmp (volname, "all")) {
- cli_out ("\"all\" cannot be the name of a volume.");
- goto out;
+ for (i = 0; invalid_volnames[i]; i++) {
+ if (!strcmp (volname, invalid_volnames[i])) {
+ cli_err ("\"%s\" cannot be the name of a volume.",
+ volname);
+ goto out;
+ }
}
if (strchr (volname, '/'))
@@ -229,7 +261,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
switch (type) {
case GF_CLUSTER_TYPE_STRIPE_REPLICATE:
case GF_CLUSTER_TYPE_REPLICATE:
- cli_out ("replica option given twice");
+ cli_err ("replica option given twice");
goto out;
case GF_CLUSTER_TYPE_NONE:
type = GF_CLUSTER_TYPE_REPLICATE;
@@ -245,7 +277,8 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
}
replica_count = strtol (words[index+1], NULL, 0);
if (replica_count < 2) {
- cli_out ("replica count should be greater than 1");
+ cli_err ("replica count should be greater"
+ " than 1");
ret = -1;
goto out;
}
@@ -259,7 +292,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
switch (type) {
case GF_CLUSTER_TYPE_STRIPE_REPLICATE:
case GF_CLUSTER_TYPE_STRIPE:
- cli_out ("stripe option given twice");
+ cli_err ("stripe option given twice");
goto out;
case GF_CLUSTER_TYPE_NONE:
type = GF_CLUSTER_TYPE_STRIPE;
@@ -274,7 +307,8 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
}
stripe_count = strtol (words[index+1], NULL, 0);
if (stripe_count < 2) {
- cli_out ("stripe count should be greater than 1");
+ cli_err ("stripe count should be greater"
+ " than 1");
ret = -1;
goto out;
}
@@ -286,7 +320,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
} else if ((strcmp (w, "transport")) == 0) {
if (trans_type) {
- cli_out ("'transport' option given more"
+ cli_err ("'transport' option given more"
" than one time");
goto out;
}
@@ -304,7 +338,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
goto out;
}
index += 2;
- } else {
+ } else {
GF_ASSERT (!"opword mismatch");
ret = -1;
goto out;
@@ -327,7 +361,12 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
brick_index = index;
- ret = cli_cmd_bricks_parse (words, wordcount, brick_index, &bricks,
+ if (strcmp (words[wordcount - 1], "force") == 0) {
+ is_force = _gf_true;
+ wc = wordcount - 1;
+ }
+
+ ret = cli_cmd_bricks_parse (words, wc, brick_index, &bricks,
&brick_count);
if (ret)
goto out;
@@ -335,20 +374,20 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
/* If brick-count is not valid when replica or stripe is
given, exit here */
if (!brick_count) {
- cli_out ("No bricks specified");
+ cli_err ("No bricks specified");
ret = -1;
goto out;
}
if (brick_count % sub_count) {
if (type == GF_CLUSTER_TYPE_STRIPE)
- cli_out ("number of bricks is not a multiple of "
+ cli_err ("number of bricks is not a multiple of "
"stripe count");
else if (type == GF_CLUSTER_TYPE_REPLICATE)
- cli_out ("number of bricks is not a multiple of "
+ cli_err ("number of bricks is not a multiple of "
"replica count");
else
- cli_out ("number of bricks given doesn't match "
+ cli_err ("number of bricks given doesn't match "
"required count");
ret = -1;
@@ -367,7 +406,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
ret = dict_set_dynstr (dict, "transport", trans_type);
if (ret)
goto out;
-
+ trans_type = NULL;
ret = dict_set_dynstr (dict, "bricks", bricks);
if (ret)
@@ -377,6 +416,10 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options
if (ret)
goto out;
+ ret = dict_set_int32 (dict, "force", is_force);
+ if (ret)
+ goto out;
+
*options = dict;
out:
@@ -385,6 +428,9 @@ out:
if (dict)
dict_destroy (dict);
}
+
+ GF_FREE (trans_type);
+
return ret;
}
@@ -498,7 +544,7 @@ cli_cmd_quota_parse (const char **words, int wordcount, dict_t **options)
goto out;
if (!strcmp (volname, "all")) {
- cli_out ("\"all\" cannot be the name of a volume.");
+ cli_err ("\"all\" cannot be the name of a volume.");
goto out;
}
@@ -523,16 +569,26 @@ cli_cmd_quota_parse (const char **words, int wordcount, dict_t **options)
goto out;
}
- if ((strcmp (w, "enable")) == 0 && wordcount == 4) {
- type = GF_QUOTA_OPTION_TYPE_ENABLE;
- ret = 0;
- goto set_type;
+ if (strcmp (w, "enable") == 0) {
+ if (wordcount == 4) {
+ type = GF_QUOTA_OPTION_TYPE_ENABLE;
+ ret = 0;
+ goto set_type;
+ } else {
+ ret = -1;
+ goto out;
+ }
}
- if (strcmp (w, "disable") == 0 && wordcount == 4) {
- type = GF_QUOTA_OPTION_TYPE_DISABLE;
- ret = 0;
- goto set_type;
+ if (strcmp (w, "disable") == 0) {
+ if (wordcount == 4) {
+ type = GF_QUOTA_OPTION_TYPE_DISABLE;
+ ret = 0;
+ goto set_type;
+ } else {
+ ret = -1;
+ goto out;
+ }
}
if (strcmp (w, "limit-usage") == 0) {
@@ -544,7 +600,7 @@ cli_cmd_quota_parse (const char **words, int wordcount, dict_t **options)
type = GF_QUOTA_OPTION_TYPE_LIMIT_USAGE;
if (words[4][0] != '/') {
- cli_out ("Please enter absolute path");
+ cli_err ("Please enter absolute path");
return -2;
}
@@ -553,14 +609,14 @@ cli_cmd_quota_parse (const char **words, int wordcount, dict_t **options)
goto out;
if (!words[5]) {
- cli_out ("Please enter the limit value to be set");
+ cli_err ("Please enter the limit value to be set");
return -2;
}
ret = gf_string2bytesize (words[5], &value);
if (ret != 0) {
- cli_out ("Please enter a correct value");
+ cli_err ("Please enter a correct value");
return -1;
}
@@ -579,7 +635,7 @@ cli_cmd_quota_parse (const char **words, int wordcount, dict_t **options)
type = GF_QUOTA_OPTION_TYPE_REMOVE;
if (words[4][0] != '/') {
- cli_out ("Please enter absolute path");
+ cli_err ("Please enter absolute path");
return -2;
}
@@ -635,17 +691,117 @@ out:
return ret;
}
+static inline gf_boolean_t
+cli_is_key_spl (char *key)
+{
+ return (strcmp (key, "group") == 0);
+}
+
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+static int
+cli_add_key_group (dict_t *dict, char *key, char *value, char **op_errstr)
+{
+ int ret = -1;
+ int opt_count = 0;
+ char iter_key[1024] = {0,};
+ char iter_val[1024] = {0,};
+ char *saveptr = NULL;
+ char *tok_key = NULL;
+ char *tok_val = NULL;
+ char *dkey = NULL;
+ char *dval = NULL;
+ char *tagpath = NULL;
+ char *buf = NULL;
+ char line[PATH_MAX + 256] = {0,};
+ char errstr[2048] = "";
+ FILE *fp = NULL;
+
+ ret = gf_asprintf (&tagpath, "%s/groups/%s",
+ GLUSTERD_DEFAULT_WORKDIR, value);
+ if (ret == -1) {
+ tagpath = NULL;
+ goto out;
+ }
+
+ fp = fopen (tagpath, "r");
+ if (!fp) {
+ ret = -1;
+ snprintf(errstr, sizeof(errstr), "Unable to open file '%s'."
+ " Error: %s", tagpath, strerror (errno));
+ if (op_errstr)
+ *op_errstr = gf_strdup(errstr);
+ goto out;
+ }
+
+ opt_count = 0;
+ buf = line;
+ while (fscanf (fp, "%s", buf) != EOF) {
+
+ opt_count++;
+ tok_key = strtok_r (line, "=", &saveptr);
+ tok_val = strtok_r (NULL, "=", &saveptr);
+ if (!tok_key || !tok_val) {
+ ret = -1;
+ snprintf(errstr, sizeof(errstr), "'%s' file format "
+ "not valid.", tagpath);
+ if (op_errstr)
+ *op_errstr = gf_strdup(errstr);
+ goto out;
+ }
+
+ snprintf (iter_key, sizeof (iter_key), "key%d", opt_count);
+ dkey = gf_strdup (tok_key);
+ ret = dict_set_dynstr (dict, iter_key, dkey);
+ if (ret)
+ goto out;
+ dkey = NULL;
+
+ snprintf (iter_val, sizeof (iter_val), "value%d", opt_count);
+ dval = gf_strdup (tok_val);
+ ret = dict_set_dynstr (dict, iter_val, dval);
+ if (ret)
+ goto out;
+ dval = NULL;
+
+ }
+
+ if (!opt_count) {
+ ret = -1;
+ snprintf(errstr, sizeof(errstr), "'%s' file format "
+ "not valid.", tagpath);
+ if (op_errstr)
+ *op_errstr = gf_strdup(errstr);
+ goto out;
+ }
+ ret = dict_set_int32 (dict, "count", opt_count);
+out:
+
+ GF_FREE (tagpath);
+
+ if (ret) {
+ GF_FREE (dkey);
+ GF_FREE (dval);
+ }
+
+ if (fp)
+ fclose (fp);
+
+ return ret;
+}
+#undef GLUSTERD_DEFAULT_WORKDIR
+
int32_t
-cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options)
+cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options,
+ char **op_errstr)
{
- dict_t *dict = NULL;
- char *volname = NULL;
- int ret = -1;
- int count = 0;
- char *key = NULL;
- char *value = NULL;
- int i = 0;
- char str[50] = {0,};
+ dict_t *dict = NULL;
+ char *volname = NULL;
+ int ret = -1;
+ int count = 0;
+ char *key = NULL;
+ char *value = NULL;
+ int i = 0;
+ char str[50] = {0,};
GF_ASSERT (words);
GF_ASSERT (options);
@@ -667,28 +823,65 @@ cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options)
if (ret)
goto out;
- if (!strcmp (volname, "help") && wordcount == 3 )
- ret = dict_set_str (dict, "help", volname);
-
- if (!strcmp (volname, "help-xml") && wordcount == 3 )
- ret = dict_set_str (dict, "help-xml", volname);
+ if ((!strcmp (volname, "help") || !strcmp (volname, "help-xml"))
+ && wordcount == 3 ) {
+ ret = dict_set_str (dict, volname, volname);
+ if (ret)
+ goto out;
- if (ret)
+ } else if (wordcount < 5) {
+ ret = -1;
goto out;
+ } else if (wordcount == 5 && cli_is_key_spl ((char *)words[3])) {
+ key = (char *) words[3];
+ value = (char *) words[4];
+ if ( !key || !value) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = gf_strip_whitespace (value, strlen (value));
+ if (ret == -1)
+ goto out;
+
+ if (strlen (value) == 0) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = cli_add_key_group (dict, key, value, op_errstr);
+ if (ret == 0)
+ *options = dict;
+ goto out;
+ }
for (i = 3; i < wordcount; i+=2) {
- key = (char *) words[i];
- value = (char *) words[i+1];
+ key = (char *) words[i];
+ value = (char *) words[i+1];
- if ( !key || !value) {
- ret = -1;
- goto out;
- }
+ if ( !key || !value) {
+ ret = -1;
+ goto out;
+ }
count++;
+ ret = gf_strip_whitespace (value, strlen (value));
+ if (ret == -1)
+ goto out;
+
+ if (strlen (value) == 0) {
+ ret = -1;
+ goto out;
+ }
+
+ if (cli_is_key_spl (key)) {
+ ret = -1;
+ goto out;
+ }
+
sprintf (str, "key%d", count);
ret = dict_set_str (dict, str, key);
if (ret)
@@ -709,10 +902,8 @@ cli_cmd_volume_set_parse (const char **words, int wordcount, dict_t **options)
*options = dict;
out:
- if (ret) {
- if (dict)
- dict_destroy (dict);
- }
+ if (ret)
+ dict_destroy (dict);
return ret;
}
@@ -726,6 +917,13 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount,
int ret = -1;
int brick_count = 0, brick_index = 0;
char *bricks = NULL;
+ char *opwords_cl[] = { "replica", "stripe", NULL };
+ gf1_cluster_type type = GF_CLUSTER_TYPE_NONE;
+ int count = 1;
+ char *w = NULL;
+ int index;
+ gf_boolean_t is_force = _gf_false;
+ int wc = wordcount;
GF_ASSERT (words);
GF_ASSERT (options);
@@ -751,9 +949,65 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount,
ret = -1;
goto out;
}
+ if (wordcount < 6) {
+ /* no options seem to be given; go directly to parse_bricks */
+ brick_index = 3;
+ type = GF_CLUSTER_TYPE_NONE;
+ goto parse_bricks;
+ }
- brick_index = 3;
- ret = cli_cmd_bricks_parse (words, wordcount, brick_index, &bricks,
+ w = str_getunamb (words[3], opwords_cl);
+ if (!w) {
+ type = GF_CLUSTER_TYPE_NONE;
+ index = 3;
+ } else if ((strcmp (w, "replica")) == 0) {
+ type = GF_CLUSTER_TYPE_REPLICATE;
+ if (wordcount < 5) {
+ ret = -1;
+ goto out;
+ }
+ count = strtol (words[4], NULL, 0);
+ if (!count || (count < 2)) {
+ cli_err ("replica count should be greater than 1");
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (dict, "replica-count", count);
+ if (ret)
+ goto out;
+ index = 5;
+ } else if ((strcmp (w, "stripe")) == 0) {
+ type = GF_CLUSTER_TYPE_STRIPE;
+ if (wordcount < 5) {
+ ret = -1;
+ goto out;
+ }
+ count = strtol (words[4], NULL, 0);
+ if (!count || (count < 2)) {
+ cli_err ("stripe count should be greater than 1");
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (dict, "stripe-count", count);
+ if (ret)
+ goto out;
+ index = 5;
+ } else {
+ GF_ASSERT (!"opword mismatch");
+ ret = -1;
+ goto out;
+ }
+
+ brick_index = index;
+
+parse_bricks:
+
+ if (strcmp (words[wordcount - 1], "force") == 0) {
+ is_force = _gf_true;
+ wc = wordcount - 1;
+ }
+
+ ret = cli_cmd_bricks_parse (words, wc, brick_index, &bricks,
&brick_count);
if (ret)
goto out;
@@ -767,6 +1021,10 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount,
if (ret)
goto out;
+ ret = dict_set_int32 (dict, "force", is_force);
+ if (ret)
+ goto out;
+
*options = dict;
out:
@@ -794,20 +1052,21 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount,
int32_t j = 0;
char *tmp_brick = NULL;
char *tmp_brick1 = NULL;
- char *opwords[] = { "start", "commit", "pause", "abort", "status",
+ char *type_opword[] = { "replica", NULL };
+ char *opwords[] = { "start", "commit", "stop", "status",
"force", NULL };
char *w = NULL;
int32_t command = GF_OP_CMD_NONE;
+ long count = 0;
GF_ASSERT (words);
GF_ASSERT (options);
- dict = dict_new ();
-
- if (!dict)
+ if (wordcount < 4)
goto out;
- if (wordcount < 3)
+ dict = dict_new ();
+ if (!dict)
goto out;
volname = (char *)words[2];
@@ -818,6 +1077,29 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount,
if (ret)
goto out;
+ brick_index = 3;
+ w = str_getunamb (words[3], type_opword);
+ if (w && !strcmp ("replica", w)) {
+ if (wordcount < 5) {
+ ret = -1;
+ goto out;
+ }
+ count = strtol (words[4], NULL, 0);
+ if (count < 1) {
+ cli_err ("replica count should be greater than 0 in "
+ "case of remove-brick");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "replica-count", count);
+ if (ret)
+ goto out;
+ brick_index = 5;
+ } else if (w) {
+ GF_ASSERT (!"opword mismatch");
+ }
+
w = str_getunamb (words[wordcount - 1], opwords);
if (!w) {
/* Should be default 'force' */
@@ -833,10 +1115,8 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount,
command = GF_OP_CMD_COMMIT;
if (question)
*question = 1;
- } else if (!strcmp ("pause", w)) {
- command = GF_OP_CMD_PAUSE;
- } else if (!strcmp ("abort", w)) {
- command = GF_OP_CMD_ABORT;
+ } else if (!strcmp ("stop", w)) {
+ command = GF_OP_CMD_STOP;
} else if (!strcmp ("status", w)) {
command = GF_OP_CMD_STATUS;
} else if (!strcmp ("force", w)) {
@@ -861,8 +1141,6 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount,
command);
- brick_index = 3;
-
tmp_index = brick_index;
tmp_brick = GF_MALLOC(2048 * sizeof(*tmp_brick), gf_common_mt_char);
@@ -884,13 +1162,13 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount,
while (brick_index < wordcount) {
if (validate_brick_name ((char *)words[brick_index])) {
- cli_out ("wrong brick type: %s, use <HOSTNAME>:"
+ cli_err ("wrong brick type: %s, use <HOSTNAME>:"
"<export-dir-abs-path>", words[brick_index]);
ret = -1;
goto out;
} else {
delimiter = strrchr(words[brick_index], ':');
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
@@ -902,7 +1180,7 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount,
if (!(strcmp (tmp_brick, tmp_brick1))) {
gf_log("",GF_LOG_ERROR, "Duplicate bricks"
" found %s", words[brick_index]);
- cli_out("Duplicate bricks found %s",
+ cli_err("Duplicate bricks found %s",
words[brick_index]);
ret = -1;
goto out;
@@ -929,10 +1207,8 @@ out:
dict_destroy (dict);
}
- if (tmp_brick)
- GF_FREE (tmp_brick);
- if (tmp_brick1)
- GF_FREE (tmp_brick1);
+ GF_FREE (tmp_brick);
+ GF_FREE (tmp_brick1);
return ret;
}
@@ -951,6 +1227,7 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
char *opwords[] = { "start", "commit", "pause", "abort", "status",
NULL };
char *w = NULL;
+ gf_boolean_t is_force = _gf_false;
GF_ASSERT (words);
GF_ASSERT (options);
@@ -978,13 +1255,13 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
}
if (validate_brick_name ((char *)words[3])) {
- cli_out ("wrong brick type: %s, use "
+ cli_err ("wrong brick type: %s, use "
"<HOSTNAME>:<export-dir-abs-path>", words[3]);
ret = -1;
goto out;
} else {
delimiter = strrchr ((char *)words[3], ':');
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
@@ -999,13 +1276,13 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
}
if (validate_brick_name ((char *)words[4])) {
- cli_out ("wrong brick type: %s, use "
+ cli_err ("wrong brick type: %s, use "
"<HOSTNAME>:<export-dir-abs-path>", words[4]);
ret = -1;
goto out;
} else {
delimiter = strrchr ((char *)words[4], ':');
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
@@ -1039,6 +1316,7 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
GF_ASSERT (!"opword mismatch");
/* commit force option */
+
op_index = 6;
if (wordcount > (op_index + 1)) {
@@ -1047,8 +1325,17 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
}
if (wordcount == (op_index + 1)) {
+ if ((replace_op != GF_REPLACE_OP_COMMIT) &&
+ (replace_op != GF_REPLACE_OP_START)) {
+ ret = -1;
+ goto out;
+ }
if (!strcmp ("force", words[op_index])) {
- replace_op = GF_REPLACE_OP_COMMIT_FORCE;
+ if (replace_op == GF_REPLACE_OP_COMMIT)
+ replace_op = GF_REPLACE_OP_COMMIT_FORCE;
+
+ else if (replace_op == GF_REPLACE_OP_START)
+ is_force = _gf_true;
}
}
@@ -1062,14 +1349,15 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,
if (ret)
goto out;
-
-
+ ret = dict_set_int32 (dict, "force", is_force);
+ if (ret)
+ goto out;
*options = dict;
out:
if (ret) {
- gf_log ("cli", GF_LOG_ERROR, "Unable to parse remove-brick CLI");
+ gf_log ("cli", GF_LOG_ERROR, "Unable to parse replace-brick CLI");
if (dict)
dict_destroy (dict);
}
@@ -1105,12 +1393,12 @@ cli_cmd_log_filename_parse (const char **words, int wordcount, dict_t **options)
delimiter = strchr (words[4], ':');
if (!delimiter || delimiter == words[4]
|| *(delimiter+1) != '/') {
- cli_out ("wrong brick type: %s, use <HOSTNAME>:"
+ cli_err ("wrong brick type: %s, use <HOSTNAME>:"
"<export-dir-abs-path>", words[4]);
ret = -1;
goto out;
} else {
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
@@ -1160,8 +1448,8 @@ cli_cmd_log_level_parse (const char **words, int worcount, dict_t **options)
ret = glusterd_check_log_level(words[5]);
if (ret == -1) {
- cli_out("Invalid log level [%s] specified", words[5]);
- cli_out("Valid values for loglevel: (DEBUG|WARNING|ERROR"
+ cli_err("Invalid log level [%s] specified", words[5]);
+ cli_err("Valid values for loglevel: (DEBUG|WARNING|ERROR"
"|CRITICAL|NONE|TRACE)");
goto out;
}
@@ -1221,12 +1509,12 @@ cli_cmd_log_locate_parse (const char **words, int wordcount, dict_t **options)
delimiter = strchr (words[4], ':');
if (!delimiter || delimiter == words[4]
|| *(delimiter+1) != '/') {
- cli_out ("wrong brick type: %s, use <HOSTNAME>:"
+ cli_err ("wrong brick type: %s, use <HOSTNAME>:"
"<export-dir-abs-path>", words[4]);
ret = -1;
goto out;
} else {
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
@@ -1272,12 +1560,12 @@ cli_cmd_log_rotate_parse (const char **words, int wordcount, dict_t **options)
delimiter = strchr (words[4], ':');
if (!delimiter || delimiter == words[4]
|| *(delimiter+1) != '/') {
- cli_out ("wrong brick type: %s, use <HOSTNAME>:"
+ cli_err ("wrong brick type: %s, use <HOSTNAME>:"
"<export-dir-abs-path>", words[4]);
ret = -1;
goto out;
} else {
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
@@ -1308,22 +1596,161 @@ gsyncd_glob_check (const char *w)
return !!strpbrk (w, "*?[");
}
+static int
+config_parse (const char **words, int wordcount, dict_t *dict,
+ unsigned cmdi, unsigned glob)
+{
+ int32_t ret = -1;
+ int32_t i = -1;
+ char *append_str = NULL;
+ size_t append_len = 0;
+ char *subop = NULL;
+
+ switch ((wordcount - 1) - cmdi) {
+ case 0:
+ subop = gf_strdup ("get-all");
+ break;
+ case 1:
+ if (words[cmdi + 1][0] == '!') {
+ (words[cmdi + 1])++;
+ if (gf_asprintf (&subop, "del%s",
+ glob ? "-glob" : "") == -1)
+ subop = NULL;
+ } else
+ subop = gf_strdup ("get");
+
+ ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
+ if (ret < 0)
+ goto out;
+ break;
+ default:
+ if (gf_asprintf (&subop, "set%s", glob ? "-glob" : "") == -1)
+ subop = NULL;
+
+ ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
+ if (ret < 0)
+ goto out;
+
+ /* join the varargs by spaces to get the op_value */
+
+ for (i = cmdi + 2; i < wordcount; i++)
+ append_len += (strlen (words[i]) + 1);
+ /* trailing strcat will add two bytes, make space for that */
+ append_len++;
+
+ append_str = GF_CALLOC (1, append_len, cli_mt_append_str);
+ if (!append_str) {
+ ret = -1;
+ goto out;
+ }
+
+ for (i = cmdi + 2; i < wordcount; i++) {
+ strcat (append_str, words[i]);
+ strcat (append_str, " ");
+ }
+ append_str[append_len - 2] = '\0';
+ /* "checkpoint now" is special: we resolve that "now" */
+ if (strcmp (words[cmdi + 1], "checkpoint") == 0 &&
+ strcmp (append_str, "now") == 0) {
+ struct timeval tv = {0,};
+
+ ret = gettimeofday (&tv, NULL);
+ if (ret == -1)
+ goto out; /* FIXME: free append_str? */
+
+ GF_FREE (append_str);
+ append_str = GF_CALLOC (1, 300, cli_mt_append_str);
+ if (!append_str) {
+ ret = -1;
+ goto out;
+ }
+ strcpy (append_str, "as of ");
+ gf_time_fmt (append_str + strlen ("as of "),
+ 300 - strlen ("as of "),
+ tv.tv_sec, gf_timefmt_FT);
+ }
+
+ ret = dict_set_dynstr (dict, "op_value", append_str);
+ }
+
+ ret = -1;
+ if (subop) {
+ ret = dict_set_dynstr (dict, "subop", subop);
+ if (!ret)
+ subop = NULL;
+ }
+
+out:
+ if (ret && append_str)
+ GF_FREE (append_str);
+
+ GF_FREE (subop);
+
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
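/* Illustrative aside (not part of the patch): the default branch of
 * config_parse() above joins everything after the option name into one
 * space-separated op_value. A minimal standalone sketch of that join,
 * using plain libc calloc/strcat rather than GF_CALLOC: */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Join words[from..count-1] with single spaces; caller frees the result. */
static char *
join_words (const char **words, int from, int count)
{
        size_t len = 1;                         /* terminating NUL */
        char  *out = NULL;
        int    i   = 0;

        for (i = from; i < count; i++)
                len += strlen (words[i]) + 1;   /* word + separator */

        out = calloc (1, len);
        if (!out)
                return NULL;

        for (i = from; i < count; i++) {
                strcat (out, words[i]);
                if (i + 1 < count)
                        strcat (out, " ");
        }
        return out;
}

int
main (void)
{
        const char *words[] = { "config", "checkpoint", "now" };
        char       *val     = join_words (words, 1, 3);

        if (val) {
                printf ("op_value = \"%s\"\n", val); /* "checkpoint now" */
                free (val);
        }
        return 0;
}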
+static int32_t
+force_push_pem_parse (const char **words, int wordcount,
+ dict_t *dict, unsigned *cmdi)
+{
+ int32_t ret = 0;
+
+ if (!strcmp ((char *)words[wordcount-1], "force")) {
+ if ((strcmp ((char *)words[wordcount-2], "start")) &&
+ (strcmp ((char *)words[wordcount-2], "stop")) &&
+ (strcmp ((char *)words[wordcount-2], "create")) &&
+ (strcmp ((char *)words[wordcount-2], "push-pem"))) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_uint32 (dict, "force",
+ _gf_true);
+ if (ret)
+ goto out;
+ (*cmdi)++;
+
+ if (!strcmp ((char *)words[wordcount-2], "push-pem")) {
+ if (strcmp ((char *)words[wordcount-3], "create")) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (dict, "push_pem", 1);
+ if (ret)
+ goto out;
+ (*cmdi)++;
+ }
+ } else if (!strcmp ((char *)words[wordcount-1], "push-pem")) {
+ if (strcmp ((char *)words[wordcount-2], "create")) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (dict, "push_pem", 1);
+ if (ret)
+ goto out;
+ (*cmdi)++;
+ }
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+
int32_t
cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
{
int32_t ret = -1;
dict_t *dict = NULL;
gf1_cli_gsync_set type = GF_GSYNC_OPTION_TYPE_NONE;
- char *append_str = NULL;
- size_t append_len = 0;
- char *subop = NULL;
int i = 0;
unsigned masteri = 0;
unsigned slavei = 0;
unsigned glob = 0;
unsigned cmdi = 0;
- char *opwords[] = { "status", "start", "stop", "config",
- NULL };
+ char *opwords[] = { "create", "status", "start", "stop",
+ "config", "force", "delete",
+ "push-pem", "detail", NULL };
char *w = NULL;
GF_ASSERT (words);
@@ -1335,9 +1762,11 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
/* new syntax:
*
- * volume geo-replication [$m [$s]] status
+ * volume geo-replication $m $s create [push-pem] [force]
+ * volume geo-replication [$m [$s]] status [detail]
* volume geo-replication [$m] $s config [[!]$opt [$val]]
- * volume geo-replication $m $s start|stop
+ * volume geo-replication $m $s start|stop [force]
+ * volume geo-replication $m $s delete
*/
if (wordcount < 3)
@@ -1368,6 +1797,13 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
if (slavei == 3)
masteri = 2;
} else if (i <= 3) {
+ if (!strcmp ((char *)words[wordcount-1], "detail")) {
+ /* For status detail it is mandatory to provide
+ * both master and slave */
+ ret = -1;
+ goto out;
+ }
+
/* no $s, can only be status cmd
* (with either a single $m before it or nothing)
* -- these conditions imply that i <= 3 after
@@ -1394,7 +1830,12 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
if (!w)
goto out;
- if (strcmp (w, "status") == 0) {
+ if (strcmp (w, "create") == 0) {
+ type = GF_GSYNC_OPTION_TYPE_CREATE;
+
+ if (!masteri || !slavei)
+ goto out;
+ } else if (strcmp (w, "status") == 0) {
type = GF_GSYNC_OPTION_TYPE_STATUS;
if (slavei && !masteri)
@@ -1414,9 +1855,33 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
if (!masteri || !slavei)
goto out;
+ } else if (strcmp (w, "delete") == 0) {
+ type = GF_GSYNC_OPTION_TYPE_DELETE;
+
+ if (!masteri || !slavei)
+ goto out;
} else
GF_ASSERT (!"opword mismatch");
+ ret = force_push_pem_parse (words, wordcount, dict, &cmdi);
+ if (ret)
+ goto out;
+
+ if (!strcmp ((char *)words[wordcount-1], "detail")) {
+ if (strcmp ((char *)words[wordcount-2], "status")) {
+ ret = -1;
+ goto out;
+ }
+ if (!slavei || !masteri) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_uint32 (dict, "status-detail", _gf_true);
+ if (ret)
+ goto out;
+ cmdi++;
+ }
+
if (type != GF_GSYNC_OPTION_TYPE_CONFIG &&
(cmdi < wordcount - 1 || glob))
goto out;
@@ -1425,72 +1890,27 @@ cli_cmd_gsync_set_parse (const char **words, int wordcount, dict_t **options)
ret = 0;
- if (masteri)
+ if (masteri) {
ret = dict_set_str (dict, "master", (char *)words[masteri]);
+ if (!ret)
+ ret = dict_set_str (dict, "volname",
+ (char *)words[masteri]);
+ }
if (!ret && slavei)
ret = dict_set_str (dict, "slave", (char *)words[slavei]);
if (!ret)
ret = dict_set_int32 (dict, "type", type);
- if (!ret && type == GF_GSYNC_OPTION_TYPE_CONFIG) {
- switch ((wordcount - 1) - cmdi) {
- case 0:
- subop = gf_strdup ("get-all");
- break;
- case 1:
- if (words[cmdi + 1][0] == '!') {
- (words[cmdi + 1])++;
- if (gf_asprintf (&subop, "del%s", glob ? "-glob" : "") == -1)
- subop = NULL;
- } else
- subop = gf_strdup ("get");
-
- ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
- if (ret < 0)
- goto out;
- break;
- default:
- if (gf_asprintf (&subop, "set%s", glob ? "-glob" : "") == -1)
- subop = NULL;
-
- ret = dict_set_str (dict, "op_name", ((char *)words[cmdi + 1]));
- if (ret < 0)
- goto out;
-
- /* join the varargs by spaces to get the op_value */
-
- for (i = cmdi + 2; i < wordcount; i++)
- append_len += (strlen (words[i]) + 1);
- /* trailing strcat will add two bytes, make space for that */
- append_len++;
-
- append_str = GF_CALLOC (1, append_len, cli_mt_append_str);
- if (!append_str) {
- ret = -1;
- goto out;
- }
-
- for (i = cmdi + 2; i < wordcount; i++) {
- strcat (append_str, words[i]);
- strcat (append_str, " ");
- }
- append_str[append_len - 2] = '\0';
-
- ret = dict_set_dynstr (dict, "op_value", append_str);
- }
-
- if (!subop || dict_set_dynstr (dict, "subop", subop) != 0)
- ret = -1;
- }
+ if (!ret && type == GF_GSYNC_OPTION_TYPE_CONFIG)
+ ret = config_parse (words, wordcount, dict, cmdi, glob);
out:
if (ret) {
if (dict)
dict_destroy (dict);
- if (append_str)
- GF_FREE (append_str);
} else
*options = dict;
+
return ret;
}
@@ -1512,7 +1932,7 @@ cli_cmd_volume_profile_parse (const char **words, int wordcount,
if (!dict)
goto out;
- if (wordcount != 4)
+ if (wordcount < 4 || wordcount > 5)
goto out;
volname = (char *)words[2];
@@ -1534,7 +1954,19 @@ cli_cmd_volume_profile_parse (const char **words, int wordcount,
op = GF_CLI_STATS_INFO;
} else
GF_ASSERT (!"opword mismatch");
+
ret = dict_set_int32 (dict, "op", (int32_t)op);
+ if (ret)
+ goto out;
+
+ if (wordcount == 5) {
+ if (!strcmp (words[4], "nfs")) {
+ ret = dict_set_int32 (dict, "nfs", _gf_true);
+ if (ret)
+ goto out;
+ }
+ }
+
*options = dict;
out:
if (ret && dict)
@@ -1556,12 +1988,13 @@ cli_cmd_volume_top_parse (const char **words, int wordcount,
int32_t list_cnt = -1;
int index = 0;
int perf = 0;
- int32_t blk_size = 0;
- int32_t count = 0;
+ uint32_t blk_size = 0;
+ uint32_t count = 0;
+ gf_boolean_t nfs = _gf_false;
char *delimiter = NULL;
char *opwords[] = { "open", "read", "write", "opendir",
"readdir", "read-perf", "write-perf",
- NULL };
+ "clear", NULL };
char *w = NULL;
GF_ASSERT (words);
@@ -1606,13 +2039,30 @@ cli_cmd_volume_top_parse (const char **words, int wordcount,
} else if (strcmp (w, "write-perf") == 0) {
top_op = GF_CLI_TOP_WRITE_PERF;
perf = 1;
+ } else if (strcmp (w, "clear") == 0) {
+ ret = dict_set_int32 (dict, "clear-stats", 1);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Could not set clear-stats in dict");
+ goto out;
+ }
} else
GF_ASSERT (!"opword mismatch");
ret = dict_set_int32 (dict, "top-op", (int32_t)top_op);
if (ret)
goto out;
- for (index = 4; index < wordcount; index+=2) {
+ if ((wordcount > 4) && !strcmp (words[4], "nfs")) {
+ nfs = _gf_true;
+ ret = dict_set_int32 (dict, "nfs", nfs);
+ if (ret)
+ goto out;
+ index = 5;
+ } else {
+ index = 4;
+ }
+
+ for (; index < wordcount; index+=2) {
key = (char *) words[index];
value = (char *) words[index+1];
@@ -1623,14 +2073,14 @@ cli_cmd_volume_top_parse (const char **words, int wordcount,
}
if (!strcmp (key, "brick")) {
delimiter = strchr (value, ':');
- if (!delimiter || delimiter == value
+ if (!delimiter || delimiter == value
|| *(delimiter+1) != '/') {
- cli_out ("wrong brick type: %s, use <HOSTNAME>:"
+ cli_err ("wrong brick type: %s, use <HOSTNAME>:"
"<export-dir-abs-path>", value);
ret = -1;
goto out;
} else {
- ret = cli_canonicalize_path (delimiter + 1);
+ ret = gf_canonicalize_path (delimiter + 1);
if (ret)
goto out;
}
@@ -1641,39 +2091,40 @@ cli_cmd_volume_top_parse (const char **words, int wordcount,
if (!ret)
list_cnt = atoi (value);
if (ret || (list_cnt < 0) || (list_cnt > 100)) {
- cli_out ("list-cnt should be between 0 to 100");
+ cli_err ("list-cnt should be between 0 to 100");
ret = -1;
goto out;
}
- } else if (perf && !strcmp (key, "bs")) {
+ } else if (perf && !nfs && !strcmp (key, "bs")) {
ret = gf_is_str_int (value);
if (!ret)
blk_size = atoi (value);
if (ret || (blk_size <= 0)) {
if (blk_size < 0)
- cli_out ("block size is an invalid number");
+ cli_err ("block size is an invalid"
+ " number");
else
- cli_out ("block size should be an integer "
- "greater than zero");
+ cli_err ("block size should be an "
+ "integer greater than zero");
ret = -1;
goto out;
}
- ret = dict_set_int32 (dict, "blk-size", blk_size);
- } else if (perf && !strcmp (key, "count")) {
+ ret = dict_set_uint32 (dict, "blk-size", blk_size);
+ } else if (perf && !nfs && !strcmp (key, "count")) {
ret = gf_is_str_int (value);
if (!ret)
count = atoi(value);
if (ret || (count <= 0)) {
if (count < 0)
- cli_out ("count is an invalid number");
- else
- cli_out ("count should be an integer "
+ cli_err ("count is an invalid number");
+ else
+ cli_err ("count should be an integer "
"greater than zero");
ret = -1;
goto out;
}
- ret = dict_set_int32 (dict, "blk-cnt", count);
+ ret = dict_set_uint32 (dict, "blk-cnt", count);
} else {
ret = -1;
goto out;
@@ -1693,9 +2144,17 @@ cli_cmd_volume_top_parse (const char **words, int wordcount,
}
if ((blk_size > 0) ^ (count > 0)) {
+ cli_err ("Need to give both 'bs' and 'count'");
+ ret = -1;
+ goto out;
+ } else if (((uint64_t)blk_size * count) > (10 * GF_UNIT_GB)) {
+ cli_err ("'bs * count' value %"PRIu64" is greater than "
+ "maximum allowed value of 10GB",
+ ((uint64_t)blk_size * count));
ret = -1;
goto out;
}
+
*options = dict;
out:
if (ret && dict)
@@ -1703,23 +2162,178 @@ out:
return ret;
}
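/* Illustrative aside (not part of the patch): the bs/count validation in
 * cli_cmd_volume_top_parse() requires both values together and caps their
 * product at 10GB; promoting to 64 bits before multiplying avoids a 32-bit
 * overflow. A self-contained sketch, with a literal constant standing in
 * for 10 * GF_UNIT_GB: */

#include <stdio.h>
#include <stdint.h>

#define TEN_GB ((uint64_t)10 * 1024 * 1024 * 1024)

/* bs and count must be given together, and bs * count must stay <= 10GB. */
static int
validate_bs_count (uint32_t blk_size, uint32_t count)
{
        if ((blk_size > 0) ^ (count > 0))
                return -1;                      /* one without the other */
        if ((uint64_t)blk_size * count > TEN_GB)
                return -1;                      /* product too large */
        return 0;
}

int
main (void)
{
        printf ("%d\n", validate_bs_count (1024 * 1024, 4096)); /* 0: 4GB ok */
        printf ("%d\n", validate_bs_count (1024 * 1024, 0));    /* -1       */
        return 0;
}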
-int32_t
+uint32_t
+cli_cmd_get_statusop (const char *arg)
+{
+ int i = 0;
+ uint32_t ret = GF_CLI_STATUS_NONE;
+ char *w = NULL;
+ char *opwords[] = {"detail", "mem", "clients", "fd",
+ "inode", "callpool", "tasks", NULL};
+ struct {
+ char *opname;
+ uint32_t opcode;
+ } optable[] = {
+ { "detail", GF_CLI_STATUS_DETAIL },
+ { "mem", GF_CLI_STATUS_MEM },
+ { "clients", GF_CLI_STATUS_CLIENTS },
+ { "fd", GF_CLI_STATUS_FD },
+ { "inode", GF_CLI_STATUS_INODE },
+ { "callpool", GF_CLI_STATUS_CALLPOOL },
+ { "tasks", GF_CLI_STATUS_TASKS },
+ { NULL }
+ };
+
+ w = str_getunamb (arg, opwords);
+ if (!w) {
+ gf_log ("cli", GF_LOG_DEBUG,
+ "Not a status op %s", arg);
+ goto out;
+ }
+
+ for (i = 0; optable[i].opname; i++) {
+ if (!strcmp (w, optable[i].opname)) {
+ ret = optable[i].opcode;
+ break;
+ }
+ }
+
+ out:
+ return ret;
+}
+
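/* Illustrative aside (not part of the patch): cli_cmd_get_statusop() above
 * maps an option word to one bit of a status bitmask, which
 * cli_cmd_volume_status_parse() below then ORs with a target selector
 * (all/volume/brick/nfs/shd). A standalone sketch of that
 * table-plus-bitmask pattern, with made-up flag values instead of the real
 * GF_CLI_STATUS_* constants: */

#include <stdio.h>
#include <string.h>
#include <stdint.h>

enum {
        STATUS_NONE    = 0,
        STATUS_MEM     = 1 << 0,
        STATUS_CLIENTS = 1 << 1,
        STATUS_FD      = 1 << 2,
        STATUS_VOL     = 1 << 8,        /* target selectors in high bits */
        STATUS_BRICK   = 1 << 9,
};

static uint32_t
statusop_lookup (const char *arg)
{
        static const struct { const char *name; uint32_t code; } table[] = {
                { "mem",     STATUS_MEM     },
                { "clients", STATUS_CLIENTS },
                { "fd",      STATUS_FD      },
                { NULL,      STATUS_NONE    },
        };
        int i = 0;

        for (i = 0; table[i].name; i++)
                if (!strcmp (arg, table[i].name))
                        return table[i].code;
        return STATUS_NONE;
}

int
main (void)
{
        uint32_t cmd = statusop_lookup ("mem");

        cmd |= STATUS_VOL;              /* e.g. "volume status <vol> mem" */
        printf ("cmd = 0x%x\n", (unsigned)cmd);
        return 0;
}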
+int
cli_cmd_volume_status_parse (const char **words, int wordcount,
- dict_t **options)
+ dict_t **options)
{
- dict_t *dict = NULL;
- int ret = -1;
+ dict_t *dict = NULL;
+ int ret = -1;
+ uint32_t cmd = 0;
- GF_ASSERT (words);
GF_ASSERT (options);
dict = dict_new ();
if (!dict)
goto out;
- GF_ASSERT(words[2]);
+ switch (wordcount) {
+
+ case 2:
+ cmd = GF_CLI_STATUS_ALL;
+ ret = 0;
+ break;
+
+ case 3:
+ if (!strcmp (words[2], "all")) {
+ cmd = GF_CLI_STATUS_ALL;
+ ret = 0;
+
+ } else {
+ cmd = GF_CLI_STATUS_VOL;
+ ret = dict_set_str (dict, "volname", (char *)words[2]);
+ }
+
+ break;
+
+ case 4:
+ cmd = cli_cmd_get_statusop (words[3]);
+
+ if (!strcmp (words[2], "all")) {
+ if (cmd == GF_CLI_STATUS_NONE) {
+ cli_err ("%s is not a valid status option",
+ words[3]);
+ ret = -1;
+ goto out;
+ }
+ cmd |= GF_CLI_STATUS_ALL;
+ ret = 0;
+
+ } else {
+ ret = dict_set_str (dict, "volname",
+ (char *)words[2]);
+ if (ret)
+ goto out;
+
+ if (cmd == GF_CLI_STATUS_NONE) {
+ if (!strcmp (words[3], "nfs")) {
+ cmd |= GF_CLI_STATUS_NFS;
+ } else if (!strcmp (words[3], "shd")) {
+ cmd |= GF_CLI_STATUS_SHD;
+ } else {
+ cmd = GF_CLI_STATUS_BRICK;
+ ret = dict_set_str (dict, "brick",
+ (char *)words[3]);
+ }
+
+ } else {
+ cmd |= GF_CLI_STATUS_VOL;
+ ret = 0;
+ }
+ }
+
+ break;
+
+ case 5:
+ if (!strcmp (words[2], "all")) {
+ cli_err ("Cannot specify brick/nfs for \"all\"");
+ ret = -1;
+ goto out;
+ }
+
+ cmd = cli_cmd_get_statusop (words[4]);
+ if (cmd == GF_CLI_STATUS_NONE) {
+ cli_err ("%s is not a valid status option",
+ words[4]);
+ ret = -1;
+ goto out;
+ }
+
+
+ ret = dict_set_str (dict, "volname", (char *)words[2]);
+ if (ret)
+ goto out;
+
+ if (!strcmp (words[3], "nfs")) {
+ if (cmd == GF_CLI_STATUS_FD ||
+ cmd == GF_CLI_STATUS_DETAIL ||
+ cmd == GF_CLI_STATUS_TASKS) {
+ cli_err ("Detail/FD/Tasks status not available"
+ " for NFS Servers");
+ ret = -1;
+ goto out;
+ }
+ cmd |= GF_CLI_STATUS_NFS;
+ } else if (!strcmp (words[3], "shd")){
+ if (cmd == GF_CLI_STATUS_FD ||
+ cmd == GF_CLI_STATUS_CLIENTS ||
+ cmd == GF_CLI_STATUS_DETAIL ||
+ cmd == GF_CLI_STATUS_TASKS) {
+ cli_err ("Detail/FD/Clients/Tasks status not "
+ "available for Self-heal Daemons");
+ ret = -1;
+ goto out;
+ }
+ cmd |= GF_CLI_STATUS_SHD;
+ } else {
+ if (cmd == GF_CLI_STATUS_TASKS) {
+ cli_err ("Tasks status not available for "
+ "bricks");
+ ret = -1;
+ goto out;
+ }
+ cmd |= GF_CLI_STATUS_BRICK;
+ ret = dict_set_str (dict, "brick", (char *)words[3]);
+ }
+ break;
+
+ default:
+ goto out;
+ }
- ret = dict_set_str (dict, "volname", (char *)words[2]);
+ if (ret)
+ goto out;
+
+ ret = dict_set_int32 (dict, "cmd", cmd);
if (ret)
goto out;
@@ -1733,18 +2347,20 @@ cli_cmd_volume_status_parse (const char **words, int wordcount,
}
gf_boolean_t
-cli_cmd_validate_dumpoption (const char *option)
+cli_cmd_validate_dumpoption (const char *arg, char **option)
{
- char *opwords[] = {"all", "mem", "iobuf", "callpool", "priv", "fd",
- "inode", NULL};
+ char *opwords[] = {"all", "nfs", "mem", "iobuf", "callpool", "priv",
+ "fd", "inode", "history", "inodectx", "fdctx",
+ NULL};
char *w = NULL;
- w = str_getunamb (option, opwords);
+ w = str_getunamb (arg, opwords);
if (!w) {
gf_log ("cli", GF_LOG_DEBUG, "Unknown statedump option %s",
- option);
+ arg);
return _gf_false;
}
+ *option = w;
return _gf_true;
}
@@ -1756,25 +2372,27 @@ cli_cmd_volume_statedump_options_parse (const char **words, int wordcount,
int i = 0;
dict_t *dict = NULL;
int option_cnt = 0;
+ char *option = NULL;
char option_str[100] = {0,};
for (i = 3; i < wordcount; i++, option_cnt++) {
- if (!cli_cmd_validate_dumpoption (words[i])) {
+ if (!cli_cmd_validate_dumpoption (words[i], &option)) {
ret = -1;
goto out;
}
- strncat (option_str, words[i], sizeof (words [i]));
+ strncat (option_str, option, strlen (option));
strncat (option_str, " ", 1);
}
+
dict = dict_new ();
if (!dict)
goto out;
- ret = dict_set_str (dict, "options", gf_strdup (option_str));
+ ret = dict_set_dynstr (dict, "options", gf_strdup (option_str));
if (ret)
goto out;
- ret = dict_set_int32 (dict, "option-cnt", option_cnt);
+ ret = dict_set_int32 (dict, "option_cnt", option_cnt);
if (ret)
goto out;
@@ -1786,3 +2404,1279 @@ out:
gf_log ("cli", GF_LOG_ERROR, "Error parsing dumpoptions");
return ret;
}
+
+int
+cli_cmd_volume_clrlks_opts_parse (const char **words, int wordcount,
+ dict_t **options)
+{
+ int ret = -1;
+ int i = 0;
+ dict_t *dict = NULL;
+ char *kind_opts[4] = {"blocked", "granted", "all", NULL};
+ char *types[4] = {"inode", "entry", "posix", NULL};
+ char *free_ptr = NULL;
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ if (strcmp (words[4], "kind"))
+ goto out;
+
+ for (i = 0; kind_opts[i]; i++) {
+ if (!strcmp (words[5], kind_opts[i])) {
+ free_ptr = gf_strdup (words[5]);
+ ret = dict_set_dynstr (dict, "kind", free_ptr);
+ if (ret)
+ goto out;
+ free_ptr = NULL;
+ break;
+ }
+ }
+ if (i == 3)
+ goto out;
+
+ ret = -1;
+ for (i = 0; types[i]; i++) {
+ if (!strcmp (words[6], types[i])) {
+ free_ptr = gf_strdup (words[6]);
+ ret = dict_set_dynstr (dict, "type", free_ptr);
+ if (ret)
+ goto out;
+ free_ptr = NULL;
+ break;
+ }
+ }
+ if (i == 3)
+ goto out;
+
+ if (wordcount == 8) {
+ free_ptr = gf_strdup (words[7]);
+ ret = dict_set_dynstr (dict, "opts", free_ptr);
+ if (ret)
+ goto out;
+ free_ptr = NULL;
+ }
+
+ ret = 0;
+ *options = dict;
+out:
+ if (ret) {
+ GF_FREE (free_ptr);
+ dict_unref (dict);
+ }
+
+ return ret;
+}
+
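/* Illustrative aside (not part of the patch): cli_cmd_volume_clrlks_opts_parse()
 * above validates a word against a NULL-terminated list and detects "no match"
 * by checking whether the loop ran off the end (i == 3). A small standalone
 * helper expressing the same lookup: */

#include <stdio.h>
#include <string.h>

/* Return the index of arg in the NULL-terminated list, or -1 if absent. */
static int
word_index (const char *arg, const char *const *list)
{
        int i = 0;

        for (i = 0; list[i]; i++)
                if (!strcmp (arg, list[i]))
                        return i;
        return -1;
}

int
main (void)
{
        const char *const kinds[] = { "blocked", "granted", "all", NULL };

        printf ("%d\n", word_index ("granted", kinds)); /*  1 */
        printf ("%d\n", word_index ("bogus", kinds));   /* -1 */
        return 0;
}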
+static int
+extract_hostname_path_from_token (const char *tmp_words, char **hostname,
+ char **path)
+{
+ int ret = 0;
+ char *delimiter = NULL;
+ char *tmp_host = NULL;
+ char *host_name = NULL;
+ char *words = NULL;
+
+ *hostname = NULL;
+ *path = NULL;
+
+ words = GF_CALLOC (1, strlen (tmp_words) + 1, gf_common_mt_char);
+ if (!words){
+ ret = -1;
+ goto out;
+ }
+
+ strncpy (words, tmp_words, strlen (tmp_words) + 1);
+
+ if (validate_brick_name (words)) {
+ cli_err ("Wrong brick type: %s, use <HOSTNAME>:"
+ "<export-dir-abs-path>", words);
+ ret = -1;
+ goto out;
+ } else {
+ delimiter = strrchr (words, ':');
+ ret = gf_canonicalize_path (delimiter + 1);
+ if (ret) {
+ goto out;
+ } else {
+ *path = GF_CALLOC (1, strlen (delimiter+1) +1,
+ gf_common_mt_char);
+ if (!*path) {
+ ret = -1;
+ goto out;
+
+ }
+ strncpy (*path, delimiter +1,
+ strlen(delimiter + 1) + 1);
+ }
+ }
+
+ tmp_host = gf_strdup (words);
+ if (!tmp_host) {
+ gf_log ("cli", GF_LOG_ERROR, "Out of memory");
+ ret = -1;
+ goto out;
+ }
+ get_host_name (tmp_host, &host_name);
+ if (!host_name) {
+ ret = -1;
+ gf_log("cli",GF_LOG_ERROR, "Unable to allocate "
+ "memory");
+ goto out;
+ }
+ if (!(strcmp (host_name, "localhost") &&
+ strcmp (host_name, "127.0.0.1") &&
+ strncmp (host_name, "0.", 2))) {
+ cli_err ("Please provide a valid hostname/ip other "
+ "than localhost, 127.0.0.1 or loopback "
+ "address (0.0.0.0 to 0.255.255.255).");
+ ret = -1;
+ goto out;
+ }
+ if (!valid_internet_address (host_name, _gf_false)) {
+ cli_err ("internet address '%s' does not conform to "
+ "standards", host_name);
+ ret = -1;
+ goto out;
+ }
+
+ *hostname = GF_CALLOC (1, strlen (host_name) + 1,
+ gf_common_mt_char);
+ if (!*hostname) {
+ ret = -1;
+ goto out;
+ }
+ strncpy (*hostname, host_name, strlen (host_name) + 1);
+ ret = 0;
+
+out:
+ GF_FREE (words);
+ GF_FREE (tmp_host);
+ return ret;
+}
+
+
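/* Illustrative aside (not part of the patch): extract_hostname_path_from_token()
 * above splits a brick token at the last ':' and insists on an absolute path
 * after it. A stripped-down standalone sketch of that split, without the
 * loopback and address checks: */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Split "HOST:/abs/path" into host and path; returns 0 on success. */
static int
split_brick (const char *token, char **host, char **path)
{
        const char *colon = strrchr (token, ':');
        size_t      hlen  = 0;

        if (!colon || colon == token || colon[1] != '/')
                return -1;              /* need a host and an absolute path */

        hlen  = (size_t)(colon - token);
        *host = malloc (hlen + 1);
        *path = malloc (strlen (colon + 1) + 1);
        if (!*host || !*path) {
                free (*host);
                free (*path);
                return -1;
        }
        memcpy (*host, token, hlen);
        (*host)[hlen] = '\0';
        strcpy (*path, colon + 1);
        return 0;
}

int
main (void)
{
        char *host = NULL;
        char *path = NULL;

        if (split_brick ("server1:/export/brick1", &host, &path) == 0)
                printf ("host=%s path=%s\n", host, path);
        free (host);
        free (path);
        return 0;
}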
+int
+cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
+ dict_t **options)
+{
+ int ret = 0;
+ dict_t *dict = NULL;
+ char *hostname = NULL;
+ char *path = NULL;
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ ret = dict_set_str (dict, "volname", (char *) words[2]);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "failed to set volname");
+ goto out;
+ }
+
+ if (wordcount == 3) {
+ ret = dict_set_int32 (dict, "heal-op", GF_AFR_OP_HEAL_INDEX);
+ goto done;
+ }
+
+ if (wordcount == 4) {
+ if (!strcmp (words[3], "full")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_HEAL_FULL);
+ goto done;
+ } else if (!strcmp (words[3], "statistics")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_STATISTICS);
+ goto done;
+
+ } else if (!strcmp (words[3], "info")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_INDEX_SUMMARY);
+ goto done;
+ } else {
+ ret = -1;
+ goto out;
+ }
+ }
+ if (wordcount == 5) {
+ if (strcmp (words[3], "info") &&
+ strcmp (words[3], "statistics")) {
+ ret = -1;
+ goto out;
+ }
+
+ if (!strcmp (words[3], "info")) {
+ if (!strcmp (words[4], "healed")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_HEALED_FILES);
+ goto done;
+ }
+ if (!strcmp (words[4], "heal-failed")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_HEAL_FAILED_FILES);
+ goto done;
+ }
+ if (!strcmp (words[4], "split-brain")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_SPLIT_BRAIN_FILES);
+ goto done;
+ }
+ }
+
+ if (!strcmp (words[3], "statistics")) {
+ if (!strcmp (words[4], "heal-count")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_STATISTICS_HEAL_COUNT);
+ goto done;
+ }
+ }
+ ret = -1;
+ goto out;
+ }
+ if (wordcount == 7) {
+ if (!strcmp (words[3], "statistics")
+ && !strcmp (words[4], "heal-count")
+ && !strcmp (words[5], "replica")) {
+
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA);
+ if (ret)
+ goto out;
+ ret = extract_hostname_path_from_token (words[6],
+ &hostname, &path);
+ if (ret)
+ goto out;
+ ret = dict_set_dynstr (dict, "per-replica-cmd-hostname",
+ hostname);
+ if (ret)
+ goto out;
+ ret = dict_set_dynstr (dict, "per-replica-cmd-path",
+ path);
+ if (ret)
+ goto out;
+ else
+ goto done;
+
+ }
+ }
+ ret = -1;
+ goto out;
+done:
+ *options = dict;
+out:
+ if (ret && dict) {
+ dict_unref (dict);
+ *options = NULL;
+ }
+
+ return ret;
+}
+
+int
+cli_cmd_volume_defrag_parse (const char **words, int wordcount,
+ dict_t **options)
+{
+ dict_t *dict = NULL;
+ int ret = -1;
+ char *option = NULL;
+ char *volname = NULL;
+ char *command = NULL;
+ gf_cli_defrag_type cmd = 0;
+
+ GF_ASSERT (words);
+ GF_ASSERT (options);
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ if (!((wordcount == 4) || (wordcount == 5)))
+ goto out;
+
+ if (wordcount == 4) {
+ if (strcmp (words[3], "start") && strcmp (words[3], "stop") &&
+ strcmp (words[3], "status"))
+ goto out;
+ } else {
+ if (strcmp (words[3], "fix-layout") &&
+ strcmp (words[3], "start"))
+ goto out;
+ }
+
+ volname = (char *) words[2];
+
+ if (wordcount == 4) {
+ command = (char *) words[3];
+ }
+ if (wordcount == 5) {
+ if ((strcmp (words[3], "fix-layout") ||
+ strcmp (words[4], "start")) &&
+ (strcmp (words[3], "start") ||
+ strcmp (words[4], "force"))) {
+ ret = -1;
+ goto out;
+ }
+ command = (char *) words[3];
+ option = (char *) words[4];
+ }
+
+ if (strcmp (command, "start") == 0) {
+ cmd = GF_DEFRAG_CMD_START;
+ if (option && strcmp (option, "force") == 0) {
+ cmd = GF_DEFRAG_CMD_START_FORCE;
+ }
+ goto done;
+ }
+
+ if (strcmp (command, "fix-layout") == 0) {
+ cmd = GF_DEFRAG_CMD_START_LAYOUT_FIX;
+ goto done;
+ }
+ if (strcmp (command, "stop") == 0) {
+ cmd = GF_DEFRAG_CMD_STOP;
+ goto done;
+ }
+ if (strcmp (command, "status") == 0) {
+ cmd = GF_DEFRAG_CMD_STATUS;
+ }
+
+done:
+ ret = dict_set_str (dict, "volname", volname);
+
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "failed to set dict");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "rebalance-command", (int32_t) cmd);
+
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "failed to set dict");
+ goto out;
+ }
+
+ *options = dict;
+
+out:
+ if (ret && dict)
+ dict_destroy (dict);
+
+ return ret;
+}
+
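/* Illustrative aside (not part of the patch): cli_cmd_volume_defrag_parse()
 * above folds the (command, option) pair into a single rebalance opcode. A
 * compact standalone sketch of that mapping, with placeholder enum values
 * instead of the real GF_DEFRAG_CMD_* constants: */

#include <stdio.h>
#include <string.h>

typedef enum {
        DEFRAG_NONE = 0,
        DEFRAG_START,
        DEFRAG_START_FORCE,
        DEFRAG_START_LAYOUT_FIX,
        DEFRAG_STOP,
        DEFRAG_STATUS,
} defrag_cmd_t;

static defrag_cmd_t
map_defrag_cmd (const char *command, const char *option)
{
        if (!strcmp (command, "start"))
                return (option && !strcmp (option, "force"))
                        ? DEFRAG_START_FORCE : DEFRAG_START;
        if (!strcmp (command, "fix-layout"))
                return DEFRAG_START_LAYOUT_FIX;
        if (!strcmp (command, "stop"))
                return DEFRAG_STOP;
        if (!strcmp (command, "status"))
                return DEFRAG_STATUS;
        return DEFRAG_NONE;
}

int
main (void)
{
        printf ("%d\n", map_defrag_cmd ("start", "force")); /* 2 */
        printf ("%d\n", map_defrag_cmd ("status", NULL));   /* 5 */
        return 0;
}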
+int32_t
+cli_snap_create_desc_parse (dict_t *dict, const char **words,
+ size_t wordcount, int32_t desc_opt_loc)
+{
+ int32_t ret = -1;
+ char *desc = NULL;
+ int32_t desc_len = 0;
+
+ desc = GF_CALLOC (MAX_SNAP_DESCRIPTION_LEN + 1, sizeof(char),
+ gf_common_mt_char);
+ if (!desc) {
+ ret = -1;
+ goto out;
+ }
+
+
+ if (strlen (words[desc_opt_loc]) >= MAX_SNAP_DESCRIPTION_LEN) {
+ cli_out ("snapshot create: description truncated: "
+ "Description provided is longer than 1024 characters");
+ desc_len = MAX_SNAP_DESCRIPTION_LEN;
+ } else {
+ desc_len = strlen (words[desc_opt_loc]);
+ }
+
+ strncpy (desc, words[desc_opt_loc], desc_len);
+ desc[desc_len] = '\0';
+ /* Calculating the size of the description as given by the user */
+
+ ret = dict_set_dynstr (dict, "description", desc);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save snap "
+ "description");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret && desc)
+ GF_FREE (desc);
+
+ return ret;
+}
+
+/* Function to check whether the Volume name is repeated */
+int
+cli_check_if_volname_repeated (const char **words, unsigned int start_index,
+ uint64_t cur_index) {
+ uint64_t i = -1;
+ int ret = 0;
+
+ GF_ASSERT (words);
+
+ for (i = start_index ; i < cur_index ; i++) {
+ if (strcmp (words[i], words[cur_index]) == 0) {
+ ret = -1;
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
+/* snapshot create <snapname> <vol-name(s)> [description <description>]
+ * [force]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_create_parse (dict_t *dict, const char **words, int wordcount) {
+ uint64_t i = 0;
+ int ret = -1;
+ uint64_t volcount = 0;
+ char key[PATH_MAX] = "";
+ char *snapname = NULL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot create)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount <= cmdi + 1) {
+ cli_err ("Invalid Syntax.");
+ gf_log ("cli", GF_LOG_ERROR,
+ "Too less words for snap create command");
+ goto out;
+ }
+
+ if (strlen(words[cmdi]) >= GLUSTERD_MAX_SNAP_NAME) {
+ cli_err ("snapshot create: failed: snapname cannot exceed "
+ "255 characters.");
+ gf_log ("cli", GF_LOG_ERROR, "Snapname too long");
+
+ goto out;
+ }
+
+ snapname = (char *) words[cmdi];
+ for (i = 0 ; i < strlen (snapname); i++) {
+ /* Following volume name convention */
+ if (!isalnum (snapname[i]) && (snapname[i] != '_'
+ && (snapname[i] != '-'))) {
+ /* TODO : Is this message enough?? */
+ cli_err ("Snapname can contain only alphanumeric, "
+ "\"-\" and \"_\" characters");
+ goto out;
+ }
+ }
+
+ ret = dict_set_str (dict, "snapname", (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save snap "
+ "name");
+ goto out;
+ }
+
+ /* Filling volume name in the dictionary */
+ for (i = cmdi + 1 ; i < wordcount
+ && (strcmp (words[i], "description")) != 0
+ && (strcmp (words[i], "force") != 0); i++) {
+ volcount++;
+ /* volume index starts from 1 */
+ ret = snprintf (key, sizeof (key), "volname%"PRIu64, volcount);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_set_str (dict, key, (char *)words[i]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not "
+ "save volume name");
+ goto out;
+ }
+
+ if (i >= cmdi + 2) {
+ ret = -1;
+ cli_err("Creating multiple volume snapshot is not "
+ "supported as of now");
+ goto out;
+ }
+ /* TODO : remove the above condition check once
+ * snapshots of multiple volumes are supported */
+ }
+
+ if (volcount == 0) {
+ ret = -1;
+ cli_err ("Please provide the volume name");
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "volcount", volcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save volcount");
+ goto out;
+ }
+
+ /* Verify how we got out of "for" loop,
+ * if it is by reaching wordcount limit then goto "out",
+ * because we need not parse for "description" and "force"
+ * after this.
+ */
+ if (i == wordcount) {
+ goto out;
+ }
+
+ if ((strcmp (words[i], "description")) == 0) {
+ ++i;
+ if (i > (wordcount - 1)) {
+ ret = -1;
+ cli_err ("Please provide a description");
+ gf_log ("cli", GF_LOG_ERROR,
+ "Description not provided");
+ goto out;
+ }
+
+ ret = cli_snap_create_desc_parse(dict, words, wordcount, i);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save snap "
+ "description");
+ goto out;
+ }
+
+ if (i == (wordcount - 1))
+ goto out;
+ i++;
+ /* point the index to next word.
+ * As description might be followed by force option.
+ * Before that, check if wordcount limit is reached
+ */
+ }
+
+ if ((strcmp (words[i], "force") != 0)) {
+ ret = -1;
+ cli_err ("Invalid Syntax.");
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+ ret = dict_set_int8 (dict, "snap-force", 1);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save "
+ "snap force option");
+ goto out;
+ }
+
+ /* Check if the command has anything after "force" keyword */
+ if (++i < wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = 0;
+
+out :
+ return ret;
+}
+
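/* Illustrative aside (not part of the patch): the snapname loop in
 * cli_snap_create_parse() above accepts only alphanumerics, '-' and '_'
 * (the volume-name convention). The same check as a standalone helper: */

#include <ctype.h>
#include <stdio.h>

/* Returns 1 if name contains only [A-Za-z0-9_-], else 0. */
static int
valid_snapname (const char *name)
{
        const unsigned char *p = (const unsigned char *)name;

        for (; *p; p++)
                if (!isalnum (*p) && *p != '_' && *p != '-')
                        return 0;
        return 1;
}

int
main (void)
{
        printf ("%d\n", valid_snapname ("snap_2014-01-31")); /* 1 */
        printf ("%d\n", valid_snapname ("bad name!"));       /* 0 */
        return 0;
}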
+/* snapshot list [volname]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_list_parse (dict_t *dict, const char **words, int wordcount) {
+ int ret = -1;
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount < 2 || wordcount > 3) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ if (wordcount == 2) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", (char *)words[2]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to save volname in dictionary");
+ goto out;
+ }
+out :
+ return ret;
+}
+
+/* snapshot info [(snapname | volume <volname>)]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_info_parse (dict_t *dict, const char **words, int wordcount)
+{
+
+ int ret = -1;
+ int32_t cmd = GF_SNAP_INFO_TYPE_ALL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot info)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount > 4 || wordcount < cmdi) {
+ gf_log ("", GF_LOG_ERROR, "Invalid syntax");
+ goto out;
+ }
+
+ if (wordcount == cmdi) {
+ ret = 0;
+ goto out;
+ }
+
+ /* If 3rd word is not "volume", then it must
+ * be snapname.
+ */
+ if (strcmp (words[cmdi], "volume") != 0) {
+ ret = dict_set_str (dict, "snapname",
+ (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save "
+ "snapname %s", words[cmdi]);
+ goto out;
+ }
+
+ /* Once snap name is parsed, if we encounter any other
+ * word then fail it. Invalid Syntax.
+ * example : snapshot info <snapname> word
+ */
+ if ((cmdi + 1) != wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ cmd = GF_SNAP_INFO_TYPE_SNAP;
+ ret = 0;
+ goto out;
+ /* No need to continue the parsing once we
+ * get the snapname
+ */
+ }
+
+ /* If 3rd word is "volume", then check if next word
+ * is present. As, "snapshot info volume" is an
+ * invalid command.
+ */
+ if ((cmdi + 1) == wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", (char *)words[wordcount - 1]);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Count not save "
+ "volume name %s", words[wordcount - 1]);
+ goto out;
+ }
+ cmd = GF_SNAP_INFO_TYPE_VOL;
+out :
+ if (ret == 0) {
+ ret = dict_set_int32 (dict, "cmd", cmd);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save "
+ "type of snapshot info");
+ }
+ }
+ return ret;
+}
+
+
+
+/* snapshot restore <snapname>
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_restore_parse (dict_t *dict, const char **words, int wordcount)
+{
+
+ int ret = -1;
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount != 3) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "snapname", (char *)words[2]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save snap-name %s",
+ words[2]);
+ goto out;
+ }
+out :
+ return ret;
+}
+
+/* snapshot delete <snapname>
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ * 1 if user cancel the operation
+ */
+int
+cli_snap_delete_parse (dict_t *dict, const char **words, int wordcount,
+ struct cli_state *state) {
+
+ int ret = -1;
+ const char *question = NULL;
+ gf_answer_t answer = GF_ANSWER_NO;
+
+ question = "Deleting snap will erase all the information about "
+ "the snap. Do you still want to continue?";
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount != 3) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "snapname", (char *)words[2]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to save snapname %s",
+ words[2]);
+ goto out;
+ }
+
+ answer = cli_cmd_get_confirmation (state, question);
+ if (GF_ANSWER_NO == answer) {
+ ret = 1;
+ gf_log ("cli", GF_LOG_DEBUG, "User cancelled "
+ "snapshot delete operation");
+ goto out;
+ }
+out :
+ return ret;
+}
+
+/* snapshot status [(snapname | volume <volname>)]
+ * @arg-0, dict : Request Dictionary to be sent to server side.
+ * @arg-1, words : Contains individual words of CLI command.
+ * @arg-2, wordcount: Contains number of words present in the CLI command.
+ *
+ * return value : -1 on failure
+ * 0 on success
+ */
+int
+cli_snap_status_parse (dict_t *dict, const char **words, int wordcount)
+{
+
+ int ret = -1;
+ int32_t cmd = GF_SNAP_STATUS_TYPE_ALL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot status)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+
+ if (wordcount > 4 || wordcount < cmdi) {
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ if (wordcount == cmdi) {
+ ret = 0;
+ goto out;
+ }
+
+ /* if 3rd word is not "volume", then it must be "snapname"
+ */
+ if (strcmp (words[cmdi], "volume") != 0) {
+ ret = dict_set_str (dict, "snapname",
+ (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Count not save "
+ "snap name %s", words[cmdi]);
+ goto out;
+ }
+
+ if ((cmdi + 1) != wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = 0;
+ cmd = GF_SNAP_STATUS_TYPE_SNAP;
+ goto out;
+ }
+
+ /* If 3rd word is "volume", then check if next word is present.
+ * As, "snapshot info volume" is an invalid command
+ */
+ if ((cmdi + 1) == wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", (char *)words [wordcount - 1]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Count not save "
+ "volume name %s", words[wordcount - 1]);
+ goto out;
+ }
+ cmd = GF_SNAP_STATUS_TYPE_VOL;
+
+out :
+ if (ret == 0) {
+ ret = dict_set_int32 (dict, "cmd", cmd);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not save cmd "
+ "of snapshot status");
+ }
+ }
+ return ret;
+}
+
+
+int32_t
+cli_snap_config_limit_parse (const char **words, dict_t *dict,
+ unsigned int wordcount, unsigned int index,
+ char *key)
+{
+ int ret = -1;
+ int limit = 0;
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+ GF_ASSERT (key);
+
+ if (index >= wordcount) {
+ ret = -1;
+ cli_err ("Please provide a value for %s.",key);
+ gf_log ("cli", GF_LOG_ERROR, "Value not provided for %s", key);
+ goto out;
+ }
+
+ limit = strtol (words[index], NULL, 0);
+ if (limit <= 0) {
+ ret = -1;
+ cli_err ("%s should be greater than 0.", key);
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, key, limit);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not set "
+ "%s in dictionary", key);
+ goto out;
+ }
+
+out :
+ return ret;
+}
+
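/* Illustrative aside (not part of the patch): cli_snap_config_limit_parse()
 * above rejects any value <= 0 but relies on strtol's defaults for trailing
 * junk. A hedged sketch of a stricter standalone variant using the end
 * pointer and errno: */

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse a strictly positive 32-bit limit; returns -1 on any error. */
static int
parse_limit (const char *arg, int *limit)
{
        char *end = NULL;
        long  val = 0;

        errno = 0;
        val = strtol (arg, &end, 0);
        if (errno || end == arg || *end != '\0' || val <= 0 || val > INT_MAX)
                return -1;
        *limit = (int)val;
        return 0;
}

int
main (void)
{
        int limit = 0;

        if (parse_limit ("256", &limit) == 0)
                printf ("snap-max-hard-limit = %d\n", limit);
        return 0;
}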
+/* function cli_snap_config_parse
+ * Config Syntax : gluster snapshot config [volname]
+ * [snap-max-hard-limit <count>]
+ * [snap-max-soft-limit <count>]
+ *
+ return value: <0 on failure
+ 1 if user cancels the operation
+ 0 on success
+
+ NOTE : snap-max-soft-limit can only be set system-wide, not per volume.
+*/
+int32_t
+cli_snap_config_parse (const char **words, int wordcount, dict_t *dict,
+ struct cli_state *state)
+{
+ int ret = -1;
+ gf_answer_t answer = GF_ANSWER_NO;
+ gf_boolean_t vol_presence = _gf_false;
+ struct snap_config_opt_vals_ *conf_vals = NULL;
+ int8_t hard_limit = 0;
+ int8_t soft_limit = 0;
+ int8_t config_type = -1;
+ const char *question = NULL;
+ unsigned int cmdi = 2;
+ /* cmdi is command index, here cmdi is "2" (gluster snapshot config)*/
+
+ GF_ASSERT (words);
+ GF_ASSERT (dict);
+ GF_ASSERT (state);
+
+ if ((wordcount < 2) || (wordcount > 7)) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Invalid wordcount(%d)", wordcount);
+ goto out;
+ }
+
+ if (wordcount == 2) {
+ config_type = GF_SNAP_CONFIG_DISPLAY;
+ ret = 0;
+ goto set;
+ }
+
+ /* Check whether the 3rd word is volname */
+ if (strcmp (words[cmdi], "snap-max-hard-limit") != 0
+ && strcmp (words[cmdi], "snap-max-soft-limit") != 0) {
+ ret = dict_set_str (dict, "volname", (char *)words[cmdi]);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set volname");
+ goto out;
+ }
+ cmdi++;
+ vol_presence = _gf_true;
+
+ if (cmdi == wordcount) {
+ config_type = GF_SNAP_CONFIG_DISPLAY;
+ ret = 0;
+ goto set;
+ }
+ }
+
+ config_type = GF_SNAP_CONFIG_TYPE_SET;
+
+ if (strcmp (words[cmdi], "snap-max-hard-limit") == 0) {
+ ret = cli_snap_config_limit_parse (words, dict, wordcount,
+ ++cmdi, "snap-max-hard-limit");
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse snap "
+ "config hard limit");
+ goto out;
+ }
+ hard_limit = 1;
+
+ if (++cmdi == wordcount) {
+ ret = 0;
+ goto set;
+ }
+ }
+
+ if (strcmp (words[cmdi], "snap-max-soft-limit") == 0) {
+ if (vol_presence == 1) {
+ ret = -1;
+ cli_err ("Soft limit cannot be set to individual "
+ "volumes.");
+ gf_log ("cli", GF_LOG_ERROR, "Soft limit cannot be "
+ "set to volumes");
+ goto out;
+ }
+
+ ret = cli_snap_config_limit_parse (words, dict, wordcount,
+ ++cmdi, "snap-max-soft-limit");
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse snap "
+ "config soft limit");
+ goto out;
+ }
+
+ if (++cmdi != wordcount) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+ soft_limit = 1;
+ } else {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid Syntax");
+ goto out;
+ }
+ ret = 0; /* Success */
+
+set:
+ ret = dict_set_int32 (dict, "config-command", config_type);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to set "
+ "config-command");
+ goto out;
+ }
+
+ if (config_type == GF_SNAP_CONFIG_TYPE_SET) {
+ conf_vals = snap_confopt_vals;
+ if (hard_limit && soft_limit) {
+ question = conf_vals[GF_SNAP_CONFIG_SET_BOTH].question;
+ } else if (soft_limit) {
+ question = conf_vals[GF_SNAP_CONFIG_SET_SOFT].question;
+ } else if (hard_limit) {
+ question = conf_vals[GF_SNAP_CONFIG_SET_HARD].question;
+ }
+
+ answer = cli_cmd_get_confirmation (state, question);
+ if (GF_ANSWER_NO == answer) {
+ ret = 1;
+ gf_log ("cli", GF_LOG_DEBUG, "User cancelled "
+ "snapshot config operation");
+ }
+ }
+
+out:
+ return ret;
+}
+
+int
+validate_snapname (const char *snapname, char **opwords) {
+ int ret = -1;
+ int i = 0;
+
+ GF_ASSERT (snapname);
+ GF_ASSERT (opwords);
+
+ for (i = 0 ; opwords[i] != NULL; i++) {
+ if (strcmp (opwords[i], snapname) == 0) {
+ cli_out ("\"%s\" cannot be a snapname", snapname);
+ goto out;
+ }
+ }
+ ret = 0;
+out :
+ return ret;
+}
+
+int32_t
+cli_cmd_snapshot_parse (const char **words, int wordcount, dict_t **options,
+ struct cli_state *state)
+{
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ gf1_cli_snapshot type = GF_SNAP_OPTION_TYPE_NONE;
+ char *w = NULL;
+ char *opwords[] = {"create", "delete", "restore", "start",
+ "stop", "list", "status", "config",
+ "info", NULL};
+ char *invalid_snapnames[] = {"description", "force",
+ "volume", NULL};
+
+ GF_ASSERT (words);
+ GF_ASSERT (options);
+ GF_ASSERT (state);
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ /* Lowest wordcount possible */
+ if (wordcount < 2) {
+ gf_log ("", GF_LOG_ERROR,
+ "Invalid command: Not enough arguments");
+ goto out;
+ }
+
+ w = str_getunamb (words[1], opwords);
+ if (!w) {
+ /* Checks if the operation is a valid operation */
+ gf_log ("", GF_LOG_ERROR, "Opword Mismatch");
+ goto out;
+ }
+
+ if (!strcmp (w, "create")) {
+ type = GF_SNAP_OPTION_TYPE_CREATE;
+ } else if (!strcmp (w, "list")) {
+ type = GF_SNAP_OPTION_TYPE_LIST;
+ } else if (!strcmp (w, "info")) {
+ type = GF_SNAP_OPTION_TYPE_INFO;
+ } else if (!strcmp (w, "delete")) {
+ type = GF_SNAP_OPTION_TYPE_DELETE;
+ } else if (!strcmp (w, "config")) {
+ type = GF_SNAP_OPTION_TYPE_CONFIG;
+ } else if (!strcmp (w, "restore")) {
+ type = GF_SNAP_OPTION_TYPE_RESTORE;
+ } else if (!strcmp (w, "status")) {
+ type = GF_SNAP_OPTION_TYPE_STATUS;
+ }
+
+ if (type != GF_SNAP_OPTION_TYPE_CONFIG) {
+ ret = dict_set_int32 (dict, "hold_snap_locks", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to set hold-snap-locks value "
+ "as _gf_true");
+ goto out;
+ }
+ }
+
+ /* Check which op is intended */
+ switch (type) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ {
+ /* Syntax :
+ * gluster snapshot create <snapname> <vol-name(s)>
+ * [description <description>]
+ * [force]
+ */
+
+ /* If the snapname is not given, parsing fails.
+ * The snapname also cannot be "description",
+ * "force" or "volume"; that check is made here.
+ */
+ if (wordcount == 2) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR,
+ "Invalid Syntax");
+ goto out;
+ }
+
+ ret = validate_snapname (words[2], invalid_snapnames);
+ if (ret) {
+ goto out;
+ }
+
+ ret = cli_snap_create_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "create command parsing failed.");
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_OPTION_TYPE_INFO:
+ {
+ /* Syntax :
+ * gluster snapshot info [(snapname | volume <volname>)]
+ */
+ ret = cli_snap_info_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot info command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_LIST:
+ {
+ /* Syntax :
+ * gluster snapshot list [volname]
+ */
+
+ ret = cli_snap_list_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot list command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ {
+ /* Syntax :
+ * gluster snapshot delete <snapname>
+ */
+ ret = cli_snap_delete_parse (dict, words, wordcount,
+ state);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot delete command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_CONFIG:
+ {
+ /* snapshot config [volname] [snap-max-hard-limit <count>]
+ * [snap-max-soft-limit <percent>] */
+ ret = cli_snap_config_parse (words, wordcount, dict,
+ state);
+ if (ret) {
+ if (ret < 0)
+ gf_log ("cli", GF_LOG_ERROR,
+ "config command parsing failed.");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "type",
+ GF_SNAP_OPTION_TYPE_CONFIG);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to set "
+ "config type");
+ ret = -1;
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_OPTION_TYPE_STATUS:
+ {
+ /* Syntax :
+ * gluster snapshot status [(snapname |
+ * volume <volname>)]
+ */
+ ret = cli_snap_status_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "snapshot status command");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ {
+ /* Syntax:
+ * snapshot restore <snapname>
+ */
+ ret = cli_snap_restore_parse (dict, words, wordcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to parse "
+ "restore command");
+ goto out;
+ }
+ break;
+ }
+ default:
+ gf_log ("", GF_LOG_ERROR, "Opword Mismatch");
+ goto out;
+ break;
+ }
+
+ ret = dict_set_int32 (dict, "type", type);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to set type.");
+ goto out;
+ }
+ /* If you got so far, input is valid */
+ ret = 0;
+out:
+ if (ret) {
+ if (dict)
+ dict_destroy (dict);
+ } else
+ *options = dict;
+
+ return ret;
+}
diff --git a/cli/src/cli-cmd-peer.c b/cli/src/cli-cmd-peer.c
index 3b41195a1..551312411 100644
--- a/cli/src/cli-cmd-peer.c
+++ b/cli/src/cli-cmd-peer.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -31,6 +21,7 @@
#include "cli.h"
#include "cli-cmd.h"
#include "cli-mem-types.h"
+#include "cli1-xdr.h"
#include "protocol-common.h"
extern struct rpc_clnt *global_rpc;
@@ -50,6 +41,7 @@ cli_cmd_peer_probe_cbk (struct cli_state *state, struct cli_cmd_word *word,
dict_t *dict = NULL;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
if (!(wordcount == 3)) {
cli_usage_out (word->pattern);
@@ -71,12 +63,14 @@ cli_cmd_peer_probe_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (ret)
goto out;
- ret = valid_internet_address ((char *) words[2]);
+ ret = valid_internet_address ((char *) words[2], _gf_false);
if (ret == 1) {
ret = 0;
} else {
+ cli_out ("%s is an invalid address", words[2]);
cli_usage_out (word->pattern);
parse_error = 1;
+ ret = -1;
goto out;
}
/* if (words[3]) {
@@ -85,6 +79,9 @@ cli_cmd_peer_probe_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
*/
+
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, dict);
}
@@ -95,6 +92,9 @@ out:
if ((sent == 0) && (parse_error == 0))
cli_out ("Peer probe failed");
}
+
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -107,10 +107,12 @@ cli_cmd_peer_deprobe_cbk (struct cli_state *state, struct cli_cmd_word *word,
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
dict_t *dict = NULL;
+ int flags = 0;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
- if (!(wordcount == 3) ) {
+ if ((wordcount < 3) || (wordcount > 4)) {
cli_usage_out (word->pattern);
parse_error = 1;
goto out;
@@ -134,6 +136,22 @@ cli_cmd_peer_deprobe_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
*/
+ if (wordcount == 4) {
+ if (!strcmp("force", words[3]))
+ flags |= GF_CLI_FLAG_OP_FORCE;
+ else {
+ ret = -1;
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+ }
+ ret = dict_set_int32 (dict, "flags", flags);
+ if (ret)
+ goto out;
+
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, dict);
}
@@ -145,6 +163,8 @@ out:
cli_out ("Peer detach failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -171,7 +191,7 @@ cli_cmd_peer_status_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
if (proc->fn) {
- ret = proc->fn (frame, THIS, (char *)words[1] );
+ ret = proc->fn (frame, THIS, (void *)GF_CLI_LIST_PEERS);
}
out:
@@ -180,6 +200,48 @@ out:
if ((sent == 0) && (parse_error == 0))
cli_out ("Peer status failed");
}
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
+
+int
+cli_cmd_pool_list_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ int sent = 0;
+ int parse_error = 0;
+
+ if (wordcount != 2) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_LIST_FRIENDS];
+
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+
+ if (proc->fn) {
+ ret = proc->fn (frame, THIS,
+ (void *)GF_CLI_LIST_POOL_NODES);
+ }
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if ((sent == 0) && (parse_error == 0))
+ cli_err ("pool list: command execution failed");
+ }
+
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -188,7 +250,7 @@ struct cli_cmd cli_probe_cmds[] = {
cli_cmd_peer_probe_cbk,
"probe peer specified by <HOSTNAME>"},
- { "peer detach <HOSTNAME>",
+ { "peer detach <HOSTNAME> [force]",
cli_cmd_peer_deprobe_cbk,
"detach peer specified by <HOSTNAME>"},
@@ -200,6 +262,10 @@ struct cli_cmd cli_probe_cmds[] = {
cli_cmd_peer_help_cbk,
"Help command for peer "},
+ { "pool list",
+ cli_cmd_pool_list_cbk,
+ "list all the nodes in the pool (including localhost)"},
+
{ NULL, NULL, NULL }
};
diff --git a/cli/src/cli-cmd-snapshot.c b/cli/src/cli-cmd-snapshot.c
new file mode 100644
index 000000000..de492d683
--- /dev/null
+++ b/cli/src/cli-cmd-snapshot.c
@@ -0,0 +1,146 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <pthread.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "cli.h"
+#include "cli-cmd.h"
+
+extern rpc_clnt_prog_t *cli_rpc_prog;
+
+int
+cli_cmd_snapshot_help_cbk (struct cli_state *state, struct cli_cmd_word *in_word,
+ const char **words, int wordcount);
+
+int
+cli_cmd_snapshot_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = 0;
+ int parse_err = 0;
+ dict_t *options = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ cli_local_t *local = NULL;
+
+ proc = &cli_rpc_prog->proctable [GLUSTER_CLI_SNAP];
+ if (proc == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (frame == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Parses the command entered by the user */
+ ret = cli_cmd_snapshot_parse (words, wordcount, &options, state);
+ if (ret) {
+ if (ret < 0) {
+ cli_usage_out (word->pattern);
+ parse_err = 1;
+ }
+ else {
+ /* User might have cancelled the snapshot operation */
+ ret = 0;
+ }
+ goto out;
+ }
+
+ CLI_LOCAL_INIT (local, words, frame, options);
+
+ if (proc->fn)
+ ret = proc->fn (frame, THIS, options);
+
+out:
+ if (ret && parse_err == 0)
+ cli_out ("Snapshot command failed");
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
+
+struct cli_cmd snapshot_cmds[] = {
+ { "snapshot help",
+ cli_cmd_snapshot_help_cbk,
+ "display help for snapshot commands"
+ },
+ { "snapshot create <snapname> <volname(s)> [description <description>] [force]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Create."
+ },
+ { "snapshot restore <snapname>",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Restore."
+ },
+ { "snapshot status [(snapname | volume <volname>)]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Status."
+ },
+ { "snapshot info [(snapname | volume <volname>)]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Info."
+ },
+ { "snapshot list [volname]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot List."
+ },
+ {"snapshot config [volname] [snap-max-hard-limit <count>] [snap-max-soft-limit <percent>]",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Config."
+ },
+ {"snapshot delete <snapname>",
+ cli_cmd_snapshot_cbk,
+ "Snapshot Delete."
+ },
+ { NULL, NULL, NULL }
+};
+
+int
+cli_cmd_snapshot_help_cbk (struct cli_state *state,
+ struct cli_cmd_word *in_word,
+ const char **words,
+ int wordcount)
+{
+ struct cli_cmd *cmd = NULL;
+
+ for (cmd = snapshot_cmds; cmd->pattern; cmd++)
+ if (_gf_false == cmd->disable)
+ cli_out ("%s - %s", cmd->pattern, cmd->desc);
+
+ return 0;
+}
+
+int
+cli_cmd_snapshot_register (struct cli_state *state)
+{
+ int ret = 0;
+ struct cli_cmd *cmd = NULL;
+
+ for (cmd = snapshot_cmds; cmd->pattern; cmd++) {
+
+ ret = cli_cmd_register (&state->tree, cmd);
+ if (ret)
+ goto out;
+ }
+out:
+ return ret;
+}
diff --git a/cli/src/cli-cmd-system.c b/cli/src/cli-cmd-system.c
index 25938b897..8cfa5e70c 100644
--- a/cli/src/cli-cmd-system.c
+++ b/cli/src/cli-cmd-system.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -41,6 +31,12 @@ extern rpc_clnt_prog_t *cli_rpc_prog;
int cli_cmd_system_help_cbk (struct cli_state *state, struct cli_cmd_word *in_word,
const char **words, int wordcount);
+int cli_cmd_copy_file_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount);
+
+int cli_cmd_sys_exec_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount);
+
int
cli_cmd_getspec_cbk (struct cli_state *state, struct cli_cmd_word *word,
const char **words, int wordcount)
@@ -288,6 +284,114 @@ cli_cmd_umount_cbk (struct cli_state *state, struct cli_cmd_word *word,
return ret;
}
+int
+cli_cmd_uuid_get_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ int sent = 0;
+ int parse_error = 0;
+ dict_t *dict = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ cli_local_t *local = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ if (wordcount != 3) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_UUID_GET];
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ if (proc->fn)
+ ret = proc->fn (frame, this, dict);
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if ((sent == 0) && (parse_error == 0))
+ cli_out ("uuid get failed");
+ }
+
+ if (dict)
+ dict_unref (dict);
+
+ CLI_STACK_DESTROY (frame);
+ return ret;
+}
+
+int
+cli_cmd_uuid_reset_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ int sent = 0;
+ int parse_error = 0;
+ gf_answer_t answer = GF_ANSWER_NO;
+ char *question = NULL;
+ cli_local_t *local = NULL;
+ dict_t *dict = NULL;
+ xlator_t *this = NULL;
+
+ question = "Resetting uuid changes the uuid of local glusterd. "
+ "Do you want to continue?";
+
+ if (wordcount != 3) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_UUID_RESET];
+
+ this = THIS;
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ answer = cli_cmd_get_confirmation (state, question);
+
+ if (GF_ANSWER_NO == answer) {
+ ret = 0;
+ goto out;
+ }
+
+ /* glusterd needs no arguments for uuid reset; an empty dict is passed */
+ if (proc->fn) {
+ ret = proc->fn (frame, this, dict);
+ }
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if ((sent == 0) && (parse_error == 0))
+ cli_out ("uuid reset failed");
+ }
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
+
struct cli_cmd cli_system_cmds[] = {
{ "system:: getspec <VOLID>",
cli_cmd_getspec_cbk,
@@ -313,14 +417,163 @@ struct cli_cmd cli_system_cmds[] = {
cli_cmd_umount_cbk,
"request an umount"},
+ { "system:: uuid get",
+ cli_cmd_uuid_get_cbk,
+ "get uuid of glusterd"},
+
+ { "system:: uuid reset",
+ cli_cmd_uuid_reset_cbk,
+ "reset the uuid of glusterd"},
+
{ "system:: help",
cli_cmd_system_help_cbk,
"display help for system commands"},
+ { "system:: copy file [<filename>]",
+ cli_cmd_copy_file_cbk,
+ "Copy file from current node's $working_dir to "
+ "$working_dir of all cluster nodes"},
+
+ { "system:: execute <command> <args>",
+ cli_cmd_sys_exec_cbk,
+ "Execute the command on all the nodes "
+ "in the cluster and display their output."},
+
{ NULL, NULL, NULL }
};
int
+cli_cmd_sys_exec_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ char cmd_arg_name[PATH_MAX] = "";
+ char *command = NULL;
+ char *saveptr = NULL;
+ char *tmp = NULL;
+ int ret = -1;
+ int i = -1;
+ int cmd_args_count = 0;
+ int in_cmd_args_count = 0;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+ cli_local_t *local = NULL;
+
+ if (wordcount < 3) {
+ cli_usage_out (word->pattern);
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
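+ /* words[2] may itself contain several space-separated tokens: the
+ first token is the command, the rest become cmd_arg_<n> entries.
+ The remaining words (words[3] onward) are appended below with the
+ same numbering, so glusterd receives one flat argument list. */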
+ command = strtok_r ((char *)words[2], " ", &saveptr);
+ do {
+ tmp = strtok_r (NULL, " ", &saveptr);
+ if (tmp) {
+ in_cmd_args_count++;
+ memset (cmd_arg_name, '\0', sizeof(cmd_arg_name));
+ snprintf (cmd_arg_name, sizeof(cmd_arg_name),
+ "cmd_arg_%d", in_cmd_args_count);
+ ret = dict_set_str (dict, cmd_arg_name, tmp);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set "
+ "%s in dict", cmd_arg_name);
+ goto out;
+ }
+ }
+ } while (tmp);
+
+ cmd_args_count = wordcount - 3;
+
+ ret = dict_set_str (dict, "command", command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set command in dict");
+ goto out;
+ }
+
+ for (i=1; i <= cmd_args_count; i++) {
+ in_cmd_args_count++;
+ memset (cmd_arg_name, '\0', sizeof(cmd_arg_name));
+ snprintf (cmd_arg_name, sizeof(cmd_arg_name),
+ "cmd_arg_%d", in_cmd_args_count);
+ ret = dict_set_str (dict, cmd_arg_name,
+ (char *)words[2+i]);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set %s in dict",
+ cmd_arg_name);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (dict, "cmd_args_count", in_cmd_args_count);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set cmd_args_count in dict");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "volname", "N/A");
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set volname in dict");
+ goto out;
+ }
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_SYS_EXEC];
+ if (proc && proc->fn) {
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ ret = proc->fn (frame, THIS, (void*)dict);
+ }
+out:
+ return ret;
+}
+
+int
+cli_cmd_copy_file_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ char *filename = "";
+ dict_t *dict = NULL;
+ cli_local_t *local = NULL;
+
+ if (wordcount != 4) {
+ cli_usage_out (word->pattern);
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ filename = (char*)words[3];
+ ret = dict_set_str (dict, "source", filename);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "Unable to set filename in dict");
+
+ ret = dict_set_str (dict, "volname", "N/A");
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "Unable to set volname in dict");
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_COPY_FILE];
+ if (proc && proc->fn) {
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+ CLI_LOCAL_INIT (local, words, frame, dict);
+ ret = proc->fn (frame, THIS, (void*)dict);
+ }
+out:
+ return ret;
+}
+
+int
cli_cmd_system_help_cbk (struct cli_state *state, struct cli_cmd_word *in_word,
const char **words, int wordcount)
{
diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c
index 9b85bf819..100be0b73 100644
--- a/cli/src/cli-cmd-volume.c
+++ b/cli/src/cli-cmd-volume.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -88,9 +78,9 @@ cli_cmd_volume_info_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!local)
goto out;
- local->u.get_vol.flags = ctx.flags;
+ local->get_vol.flags = ctx.flags;
if (ctx.volname)
- local->u.get_vol.volname = gf_strdup (ctx.volname);
+ local->get_vol.volname = gf_strdup (ctx.volname);
frame->local = local;
@@ -105,6 +95,8 @@ out:
cli_out ("Getting Volume information failed!");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -116,9 +108,15 @@ cli_cmd_sync_volume_cbk (struct cli_state *state, struct cli_cmd_word *word,
int ret = -1;
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
- gf1_cli_sync_volume_req req = {0,};
int sent = 0;
int parse_error = 0;
+ dict_t *dict = NULL;
+ cli_local_t *local = NULL;
+ gf_answer_t answer = GF_ANSWER_NO;
+ const char *question = "Sync volume may make data "
+ "inaccessible while the sync "
+ "is in progress. Do you want "
+ "to continue?";
if ((wordcount < 3) || (wordcount > 4)) {
cli_usage_out (word->pattern);
@@ -126,14 +124,40 @@ cli_cmd_sync_volume_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
if ((wordcount == 3) || !strcmp(words[3], "all")) {
- req.flags = GF_CLI_SYNC_ALL;
- req.volname = "";
+ ret = dict_set_int32 (dict, "flags", (int32_t)
+ GF_CLI_SYNC_ALL);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "failed to set"
+ "flag");
+ goto out;
+ }
} else {
- req.volname = (char *)words[3];
+ ret = dict_set_str (dict, "volname", (char *) words[3]);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "failed to set "
+ "volume");
+ goto out;
+ }
}
- req.hostname = (char *)words[2];
+ ret = dict_set_str (dict, "hostname", (char *) words[2]);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "failed to set hostname");
+ goto out;
+ }
+
+ if (!(state->mode & GLUSTER_MODE_SCRIPT)) {
+ answer = cli_cmd_get_confirmation (state, question);
+ if (GF_ANSWER_NO == answer) {
+ ret = 0;
+ goto out;
+ }
+ }
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_SYNC_VOLUME];
@@ -141,8 +165,10 @@ cli_cmd_sync_volume_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!frame)
goto out;
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
- ret = proc->fn (frame, THIS, &req);
+ ret = proc->fn (frame, THIS, dict);
}
out:
@@ -152,6 +178,8 @@ out:
cli_out ("Volume sync failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -292,13 +320,11 @@ found_bad_brick_order:
out:
ai_list_tmp2 = NULL;
i = 0;
- if (brick_list_dup)
- GF_FREE (brick_list_dup);
+ GF_FREE (brick_list_dup);
list_for_each_entry (ai_list_tmp1, &ai_list->list, list) {
if (ai_list_tmp1->info)
freeaddrinfo (ai_list_tmp1->info);
- if (ai_list_tmp2)
- free (ai_list_tmp2);
+ free (ai_list_tmp2);
ai_list_tmp2 = ai_list_tmp1;
}
free (ai_list_tmp2);
@@ -319,7 +345,7 @@ cli_cmd_volume_create_cbk (struct cli_state *state, struct cli_cmd_word *word,
int32_t brick_count = 0;
int32_t sub_count = 0;
int32_t type = GF_CLUSTER_TYPE_NONE;
-
+ cli_local_t *local = NULL;
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_CREATE_VOLUME];
@@ -365,19 +391,31 @@ cli_cmd_volume_create_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
}
+
+ if (state->mode & GLUSTER_MODE_SCRIPT) {
+ ret = dict_set_int32 (options, "force", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set force "
+ "option");
+ goto out;
+ }
+ }
+
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_unref (options);
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume create failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -394,6 +432,8 @@ cli_cmd_volume_delete_cbk (struct cli_state *state, struct cli_cmd_word *word,
const char *question = NULL;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
+ dict_t *dict = NULL;
question = "Deleting volume will erase all information about the volume. "
"Do you want to continue?";
@@ -403,6 +443,10 @@ cli_cmd_volume_delete_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!frame)
goto out;
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
if (wordcount != 3) {
cli_usage_out (word->pattern);
parse_error = 1;
@@ -418,8 +462,17 @@ cli_cmd_volume_delete_cbk (struct cli_state *state, struct cli_cmd_word *word,
volname = (char *)words[2];
+ ret = dict_set_str (dict, "volname", volname);
+
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING, "dict set failed");
+ goto out;
+ }
+
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
- ret = proc->fn (frame, THIS, volname);
+ ret = proc->fn (frame, THIS, dict);
}
out:
@@ -429,6 +482,8 @@ out:
cli_out ("Volume delete failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -439,9 +494,11 @@ cli_cmd_volume_start_cbk (struct cli_state *state, struct cli_cmd_word *word,
int ret = -1;
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
- gf1_cli_start_vol_req req = {0,};
int sent = 0;
int parse_error = 0;
+ dict_t *dict = NULL;
+ int flags = 0;
+ cli_local_t *local = NULL;
frame = create_frame (THIS, THIS->ctx->pool);
if (!frame)
@@ -453,13 +510,23 @@ cli_cmd_volume_start_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
- req.volname = (char *)words[2];
- if (!req.volname)
+ dict = dict_new ();
+ if (!dict) {
+ goto out;
+ }
+
+ if (!words[2])
goto out;
+ ret = dict_set_str (dict, "volname", (char *)words[2]);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "dict set failed");
+ goto out;
+ }
+
if (wordcount == 4) {
if (!strcmp("force", words[3])) {
- req.flags |= GF_CLI_FLAG_OP_FORCE;
+ flags |= GF_CLI_FLAG_OP_FORCE;
} else {
ret = -1;
cli_usage_out (word->pattern);
@@ -467,11 +534,25 @@ cli_cmd_volume_start_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
}
+ ret = dict_set_int32 (dict, "flags", flags);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "dict set failed");
+ goto out;
+ }
+
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to serialize dict");
+ goto out;
+ }
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_START_VOLUME];
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
- ret = proc->fn (frame, THIS, &req);
+ ret = proc->fn (frame, THIS, dict);
}
out:
@@ -481,6 +562,8 @@ out:
cli_out ("Volume start failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -489,7 +572,7 @@ cli_cmd_get_confirmation (struct cli_state *state, const char *question)
{
char answer[5] = {'\0', };
char flush = '\0';
- int len = 0;
+ size_t len;
if (state->mode & GLUSTER_MODE_SCRIPT)
return GF_ANSWER_YES;
@@ -503,7 +586,7 @@ cli_cmd_get_confirmation (struct cli_state *state, const char *question)
len = strlen (answer);
- if (answer [len - 1] == '\n'){
+ if (len && answer [len - 1] == '\n'){
answer [--len] = '\0';
} else {
do{
@@ -534,10 +617,12 @@ cli_cmd_volume_stop_cbk (struct cli_state *state, struct cli_cmd_word *word,
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
int flags = 0;
- gf1_cli_stop_vol_req req = {0,};
gf_answer_t answer = GF_ANSWER_NO;
int sent = 0;
int parse_error = 0;
+ dict_t *dict = NULL;
+ char *volname = NULL;
+ cli_local_t *local = NULL;
const char *question = "Stopping volume will make its data inaccessible. "
"Do you want to continue?";
@@ -552,9 +637,14 @@ cli_cmd_volume_stop_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
- req.volname = (char *)words[2];
- if (!req.volname)
+ volname = (char*) words[2];
+
+ dict = dict_new ();
+ ret = dict_set_str (dict, "volname", volname);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "dict set failed");
goto out;
+ }
if (wordcount == 4) {
if (!strcmp("force", words[3])) {
@@ -566,6 +656,12 @@ cli_cmd_volume_stop_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
}
+ ret = dict_set_int32 (dict, "flags", flags);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "dict set failed");
+ goto out;
+ }
answer = cli_cmd_get_confirmation (state, question);
@@ -574,20 +670,23 @@ cli_cmd_volume_stop_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
- req.flags = flags;
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_STOP_VOLUME];
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
- ret = proc->fn (frame, THIS, &req);
+ ret = proc->fn (frame, THIS, dict);
}
out:
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
- cli_out ("Volume stop on '%s' failed", req.volname);
+ cli_out ("Volume stop on '%s' failed", volname);
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -644,6 +743,8 @@ out:
cli_out ("Volume rename on '%s' failed", (char *)words[2]);
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -657,7 +758,7 @@ cli_cmd_volume_defrag_cbk (struct cli_state *state, struct cli_cmd_word *word,
dict_t *dict = NULL;
int sent = 0;
int parse_error = 0;
- int index = 0;
+ cli_local_t *local = NULL;
#ifdef GF_SOLARIS_HOST_OS
cli_out ("Command not supported on Solaris");
goto out;
@@ -667,86 +768,30 @@ cli_cmd_volume_defrag_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!frame)
goto out;
- dict = dict_new ();
- if (!dict)
- goto out;
+ ret = cli_cmd_volume_defrag_parse (words, wordcount, &dict);
- if (!((wordcount == 4) || (wordcount == 5) || (wordcount == 6))) {
+ if (ret) {
cli_usage_out (word->pattern);
parse_error = 1;
- goto out;
- }
-
- if (wordcount == 4) {
- index = 3;
- } else {
- if (strcmp (words[3], "fix-layout") &&
- strcmp (words[3], "migrate-data")) {
- cli_usage_out (word->pattern);
- parse_error = 1;
- goto out;
- }
- index = 4;
- }
-
- if (strcmp (words[index], "start") && strcmp (words[index], "stop") &&
- strcmp (words[index], "status")) {
- cli_usage_out (word->pattern);
- parse_error = 1;
- goto out;
- }
-
- ret = dict_set_str (dict, "volname", (char *)words[2]);
- if (ret)
- goto out;
-
- if (wordcount == 4) {
- ret = dict_set_str (dict, "command", (char *)words[3]);
- if (ret)
- goto out;
- }
- if (wordcount == 5) {
- ret = dict_set_str (dict, "start-type", (char *)words[3]);
- if (ret)
- goto out;
- ret = dict_set_str (dict, "command", (char *)words[4]);
- if (ret)
- goto out;
- }
-
- /* 'force' option is valid only for the 'migrate-data' key */
- if (wordcount == 6) {
- if (strcmp (words[3], "migrate-data") ||
- strcmp (words[4], "start") ||
- strcmp (words[5], "force")) {
- cli_usage_out (word->pattern);
- parse_error = 1;
- goto out;
- }
- ret = dict_set_str (dict, "start-type", "migrate-data-force");
- if (ret)
- goto out;
- ret = dict_set_str (dict, "command", (char *)words[4]);
- if (ret)
- goto out;
}
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_DEFRAG_VOLUME];
+ CLI_LOCAL_INIT (local, words, frame, dict);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, dict);
}
out:
- if (dict)
- dict_destroy (dict);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume rebalance failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -756,11 +801,11 @@ cli_cmd_volume_reset_cbk (struct cli_state *state, struct cli_cmd_word *word,
{
int sent = 0;
int parse_error = 0;
-
int ret = -1;
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
dict_t *options = NULL;
+ cli_local_t *local = NULL;
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_RESET_VOLUME];
@@ -769,27 +814,27 @@ cli_cmd_volume_reset_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
ret = cli_cmd_volume_reset_parse (words, wordcount, &options);
-
if (ret) {
cli_usage_out (word->pattern);
parse_error = 1;
goto out;
}
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_unref (options);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume reset failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -805,6 +850,7 @@ cli_cmd_volume_profile_cbk (struct cli_state *state, struct cli_cmd_word *word,
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
dict_t *options = NULL;
+ cli_local_t *local = NULL;
ret = cli_cmd_volume_profile_parse (words, wordcount, &options);
@@ -820,20 +866,21 @@ cli_cmd_volume_profile_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!frame)
goto out;
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_unref (options);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume profile failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -849,6 +896,8 @@ cli_cmd_volume_set_cbk (struct cli_state *state, struct cli_cmd_word *word,
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
dict_t *options = NULL;
+ cli_local_t *local = NULL;
+ char *op_errstr = NULL;
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_SET_VOLUME];
@@ -856,28 +905,33 @@ cli_cmd_volume_set_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!frame)
goto out;
- ret = cli_cmd_volume_set_parse (words, wordcount, &options);
-
+ ret = cli_cmd_volume_set_parse (words, wordcount, &options, &op_errstr);
if (ret) {
- cli_usage_out (word->pattern);
+ if (op_errstr) {
+ cli_err ("%s", op_errstr);
+ GF_FREE (op_errstr);
+ } else
+ cli_usage_out (word->pattern);
+
parse_error = 1;
goto out;
}
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_unref (options);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume set failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -893,35 +947,63 @@ cli_cmd_volume_add_brick_cbk (struct cli_state *state,
dict_t *options = NULL;
int sent = 0;
int parse_error = 0;
+ gf_answer_t answer = GF_ANSWER_NO;
+ cli_local_t *local = NULL;
+
+ const char *question = "Changing the 'stripe count' of the volume is "
+ "not a supported feature. In some cases it may result in data "
+ "loss on the volume. Also there may be issues with regular "
+ "filesystem operations on the volume after the change. Do you "
+ "really want to continue with 'stripe' count option ? ";
frame = create_frame (THIS, THIS->ctx->pool);
if (!frame)
goto out;
ret = cli_cmd_volume_add_brick_parse (words, wordcount, &options);
-
if (ret) {
cli_usage_out (word->pattern);
parse_error = 1;
goto out;
}
+ /* TODO: there are challenges in supporting a change of
+ stripe-count; until it is properly supported, warn the user */
+ if (dict_get (options, "stripe-count")) {
+ answer = cli_cmd_get_confirmation (state, question);
+
+ if (GF_ANSWER_NO == answer) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ if (state->mode & GLUSTER_MODE_SCRIPT) {
+ ret = dict_set_int32 (options, "force", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set force "
+ "option");
+ goto out;
+ }
+ }
+
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_ADD_BRICK];
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_unref (options);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume add-brick failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -937,6 +1019,7 @@ cli_cmd_quota_cbk (struct cli_state *state, struct cli_cmd_word *word,
call_frame_t *frame = NULL;
dict_t *options = NULL;
gf_answer_t answer = GF_ANSWER_NO;
+ cli_local_t *local = NULL;
const char *question = "Disabling quota will delete all the quota "
"configuration. Do you want to continue?";
@@ -953,6 +1036,7 @@ cli_cmd_quota_cbk (struct cli_state *state, struct cli_cmd_word *word,
}
ret = cli_cmd_quota_parse (words, wordcount, &options);
+
if (ret < 0) {
cli_usage_out (word->pattern);
parse_err = 1;
@@ -964,16 +1048,17 @@ cli_cmd_quota_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
}
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn)
ret = proc->fn (frame, THIS, options);
out:
- if (options)
- dict_unref (options);
-
if (ret && parse_err == 0)
cli_out ("Quota command failed");
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -991,6 +1076,7 @@ cli_cmd_volume_remove_brick_cbk (struct cli_state *state,
int sent = 0;
int parse_error = 0;
int need_question = 0;
+ cli_local_t *local = NULL;
const char *question = "Removing brick(s) can result in data loss. "
"Do you want to Continue?";
@@ -1001,7 +1087,6 @@ cli_cmd_volume_remove_brick_cbk (struct cli_state *state,
ret = cli_cmd_volume_remove_brick_parse (words, wordcount, &options,
&need_question);
-
if (ret) {
cli_usage_out (word->pattern);
parse_error = 1;
@@ -1019,6 +1104,8 @@ cli_cmd_volume_remove_brick_cbk (struct cli_state *state,
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_REMOVE_BRICK];
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
@@ -1030,8 +1117,8 @@ out:
cli_out ("Volume remove-brick failed");
}
- if (options)
- dict_unref (options);
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -1048,6 +1135,7 @@ cli_cmd_volume_replace_brick_cbk (struct cli_state *state,
dict_t *options = NULL;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
#ifdef GF_SOLARIS_HOST_OS
cli_out ("Command not supported on Solaris");
@@ -1067,20 +1155,30 @@ cli_cmd_volume_replace_brick_cbk (struct cli_state *state,
goto out;
}
+ if (state->mode & GLUSTER_MODE_SCRIPT) {
+ ret = dict_set_int32 (options, "force", _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to set force"
+ "option");
+ goto out;
+ }
+ }
+
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_unref (options);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume replace-brick failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -1095,50 +1193,6 @@ cli_cmd_volume_set_transport_cbk (struct cli_state *state,
}
int
-cli_cmd_log_filename_cbk (struct cli_state *state, struct cli_cmd_word *word,
- const char **words, int wordcount)
-{
- int ret = -1;
- rpc_clnt_procedure_t *proc = NULL;
- call_frame_t *frame = NULL;
- dict_t *options = NULL;
- int sent = 0;
- int parse_error = 0;
-
- if (!((wordcount == 5) || (wordcount == 6))) {
- cli_usage_out (word->pattern);
- parse_error = 1;
- goto out;
- }
-
- proc = &cli_rpc_prog->proctable[GLUSTER_CLI_LOG_FILENAME];
-
- frame = create_frame (THIS, THIS->ctx->pool);
- if (!frame)
- goto out;
-
- ret = cli_cmd_log_filename_parse (words, wordcount, &options);
- if (ret)
- goto out;
-
- if (proc->fn) {
- ret = proc->fn (frame, THIS, options);
- }
-
-out:
- if (options)
- dict_destroy (options);
-
- if (ret) {
- cli_cmd_sent_status_get (&sent);
- if ((sent == 0) && (parse_error == 0))
- cli_out ("Volume log filename failed");
- }
-
- return ret;
-}
-
-int
cli_cmd_volume_top_cbk (struct cli_state *state, struct cli_cmd_word *word,
const char **words, int wordcount)
{
@@ -1149,6 +1203,7 @@ cli_cmd_volume_top_cbk (struct cli_state *state, struct cli_cmd_word *word,
dict_t *options = NULL;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
ret = cli_cmd_volume_top_parse (words, wordcount, &options);
@@ -1164,67 +1219,25 @@ cli_cmd_volume_top_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (!frame)
goto out;
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_unref (options);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume top failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
-int
-cli_cmd_log_locate_cbk (struct cli_state *state, struct cli_cmd_word *word,
- const char **words, int wordcount)
-{
- int ret = -1;
- rpc_clnt_procedure_t *proc = NULL;
- call_frame_t *frame = NULL;
- dict_t *options = NULL;
- int sent = 0;
- int parse_error = 0;
-
- if (!((wordcount == 4) || (wordcount == 5))) {
- cli_usage_out (word->pattern);
- parse_error = 1;
- goto out;
- }
-
- proc = &cli_rpc_prog->proctable[GLUSTER_CLI_LOG_LOCATE];
-
- frame = create_frame (THIS, THIS->ctx->pool);
- if (!frame)
- goto out;
-
- ret = cli_cmd_log_locate_parse (words, wordcount, &options);
- if (ret)
- goto out;
-
- if (proc->fn) {
- ret = proc->fn (frame, THIS, options);
- }
-
-out:
- if (options)
- dict_destroy (options);
-
- if (ret) {
- cli_cmd_sent_status_get (&sent);
- if ((sent == 0) && (parse_error == 0))
- cli_out ("getting log file location information failed");
- }
-
- return ret;
-}
int
cli_cmd_log_rotate_cbk (struct cli_state *state, struct cli_cmd_word *word,
@@ -1236,6 +1249,7 @@ cli_cmd_log_rotate_cbk (struct cli_state *state, struct cli_cmd_word *word,
dict_t *options = NULL;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
if (!((wordcount == 4) || (wordcount == 5))) {
cli_usage_out (word->pattern);
@@ -1253,19 +1267,19 @@ cli_cmd_log_rotate_cbk (struct cli_state *state, struct cli_cmd_word *word,
if (ret)
goto out;
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
out:
- if (options)
- dict_destroy (options);
-
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
cli_out ("Volume log rotate failed");
}
+ CLI_STACK_DESTROY (frame);
return ret;
}
@@ -1339,6 +1353,7 @@ cli_cmd_volume_gsync_set_cbk (struct cli_state *state, struct cli_cmd_word *word
dict_t *options = NULL;
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
+ cli_local_t *local = NULL;
proc = &cli_rpc_prog->proctable [GLUSTER_CLI_GSYNC_SET];
if (proc == NULL) {
@@ -1359,109 +1374,249 @@ cli_cmd_volume_gsync_set_cbk (struct cli_state *state, struct cli_cmd_word *word
goto out;
}
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn)
ret = proc->fn (frame, THIS, options);
out:
- if (options)
- dict_unref (options);
-
if (ret && parse_err == 0)
cli_out (GEOREP" command failed");
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
int
-cli_cmd_log_level_cbk (struct cli_state *state, struct cli_cmd_word *word,
- const char **words, int wordcount)
+cli_cmd_volume_status_cbk (struct cli_state *state,
+ struct cli_cmd_word *word,
+ const char **words, int wordcount)
{
int ret = -1;
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
dict_t *dict = NULL;
+ uint32_t cmd = 0;
+ cli_local_t *local = NULL;
+
+ ret = cli_cmd_volume_status_parse (words, wordcount, &dict);
+
+ if (ret) {
+ cli_usage_out (word->pattern);
+ goto out;
+ }
+
+ ret = dict_get_uint32 (dict, "cmd", &cmd);
+ if (ret)
+ goto out;
- if (wordcount != 6) {
- cli_usage_out (word->pattern);
- goto out;
+ if (!(cmd & GF_CLI_STATUS_ALL)) {
+ /* for one volume or brick */
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_STATUS_VOLUME];
+ } else {
+ /* volume status all or all detail */
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_STATUS_ALL];
}
- proc = &cli_rpc_prog->proctable[GLUSTER_CLI_LOG_LEVEL];
+ if (!proc->fn)
+ goto out;
frame = create_frame (THIS, THIS->ctx->pool);
if (!frame)
- goto out;
+ goto out;
- ret = cli_cmd_log_level_parse (words, wordcount, &dict);
- if (ret)
- goto out;
+ CLI_LOCAL_INIT (local, words, frame, dict);
- if (proc->fn)
- ret = proc->fn (frame, THIS, dict);
+ ret = proc->fn (frame, THIS, dict);
+
+out:
+ CLI_STACK_DESTROY (frame);
- out:
return ret;
}
+
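+ /* Pulls the per-brick "detail" fields (brick<i>.free, brick<i>.total,
+ block size, filesystem, inode counts, ...) out of the status dict
+ into the cli_volume_status_t structure for printing. */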
int
-cli_cmd_volume_status_cbk (struct cli_state *state,
- struct cli_cmd_word *word,
- const char **words, int wordcount)
+cli_get_detail_status (dict_t *dict, int i, cli_volume_status_t *status)
{
- int ret = -1;
- rpc_clnt_procedure_t *proc = NULL;
- call_frame_t *frame = NULL;
- dict_t *dict = NULL;
+ uint64_t free = 0;
+ uint64_t total = 0;
+ char key[1024] = {0};
+ int ret = 0;
- if (wordcount != 3) {
- cli_usage_out (word->pattern);
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.free", i);
+ ret = dict_get_uint64 (dict, key, &free);
+
+ status->free = gf_uint64_2human_readable (free);
+ if (!status->free)
goto out;
- }
- proc = &cli_rpc_prog->proctable[GLUSTER_CLI_STATUS_VOLUME];
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.total", i);
+ ret = dict_get_uint64 (dict, key, &total);
- frame = create_frame (THIS, THIS->ctx->pool);
- if (!frame)
+ status->total = gf_uint64_2human_readable (total);
+ if (!status->total)
goto out;
- ret = cli_cmd_volume_status_parse (words, wordcount, &dict);
+#ifdef GF_LINUX_HOST_OS
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.device", i);
+ ret = dict_get_str (dict, key, &(status->device));
if (ret)
- goto out;
+ status->device = NULL;
+#endif
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.block_size", i);
+ ret = dict_get_uint64 (dict, key, &(status->block_size));
+ if (ret) {
+ ret = 0;
+ status->block_size = 0;
+ }
+
+#ifdef GF_LINUX_HOST_OS
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mnt_options", i);
+ ret = dict_get_str (dict, key, &(status->mount_options));
+ if (ret)
+ status->mount_options = NULL;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.fs_name", i);
+ ret = dict_get_str (dict, key, &(status->fs_name));
+ if (ret) {
+ ret = 0;
+ status->fs_name = NULL;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.inode_size", i);
+ ret = dict_get_str (dict, key, &(status->inode_size));
+ if (ret)
+ status->inode_size = NULL;
+#endif /* GF_LINUX_HOST_OS */
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.total_inodes", i);
+ ret = dict_get_uint64 (dict, key,
+ &(status->total_inodes));
+ if (ret)
+ status->total_inodes = 0;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.free_inodes", i);
+ ret = dict_get_uint64 (dict, key, &(status->free_inodes));
+ if (ret) {
+ ret = 0;
+ status->free_inodes = 0;
+ }
- if (proc->fn)
- ret = proc->fn (frame, THIS, dict);
out:
return ret;
}
+void
+cli_print_detailed_status (cli_volume_status_t *status)
+{
+ cli_out ("%-20s : %-20s", "Brick", status->brick);
+ if (status->online)
+ cli_out ("%-20s : %-20d", "Port", status->port);
+ else
+ cli_out ("%-20s : %-20s", "Port", "N/A");
+ cli_out ("%-20s : %-20c", "Online", (status->online) ? 'Y' : 'N');
+ cli_out ("%-20s : %-20s", "Pid", status->pid_str);
+
+#ifdef GF_LINUX_HOST_OS
+ if (status->fs_name)
+ cli_out ("%-20s : %-20s", "File System", status->fs_name);
+ else
+ cli_out ("%-20s : %-20s", "File System", "N/A");
+
+ if (status->device)
+ cli_out ("%-20s : %-20s", "Device", status->device);
+ else
+ cli_out ("%-20s : %-20s", "Device", "N/A");
+
+ if (status->mount_options) {
+ cli_out ("%-20s : %-20s", "Mount Options",
+ status->mount_options);
+ } else {
+ cli_out ("%-20s : %-20s", "Mount Options", "N/A");
+ }
+
+ if (status->inode_size) {
+ cli_out ("%-20s : %-20s", "Inode Size",
+ status->inode_size);
+ } else {
+ cli_out ("%-20s : %-20s", "Inode Size", "N/A");
+ }
+#endif
+ if (status->free)
+ cli_out ("%-20s : %-20s", "Disk Space Free", status->free);
+ else
+ cli_out ("%-20s : %-20s", "Disk Space Free", "N/A");
+
+ if (status->total)
+ cli_out ("%-20s : %-20s", "Total Disk Space", status->total);
+ else
+ cli_out ("%-20s : %-20s", "Total Disk Space", "N/A");
+
+
+ if (status->total_inodes) {
+ cli_out ("%-20s : %-20ld", "Inode Count",
+ status->total_inodes);
+ } else {
+ cli_out ("%-20s : %-20s", "Inode Count", "N/A");
+ }
+
+ if (status->free_inodes) {
+ cli_out ("%-20s : %-20ld", "Free Inodes",
+ status->free_inodes);
+ } else {
+ cli_out ("%-20s : %-20s", "Free Inodes", "N/A");
+ }
+}
int
-cli_print_brick_status (char *brick, int port, int online, int pid)
+cli_print_brick_status (cli_volume_status_t *status)
{
int fieldlen = CLI_VOL_STATUS_BRICK_LEN;
- char buf[80] = {0,};
int bricklen = 0;
- int i = 0;
char *p = NULL;
int num_tabs = 0;
- bricklen = strlen (brick);
- p = brick;
+ p = status->brick;
+ bricklen = strlen (p);
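+ /* Brick paths longer than the column width are wrapped across
+ lines; port/online/pid are printed on the line that carries the
+ final fragment of the path. */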
while (bricklen > 0) {
if (bricklen > fieldlen) {
- i++;
- strncpy (buf, p, fieldlen);
- buf[strlen(buf) + 1] = '\0';
- cli_out ("%s", buf);
- p = brick + i * fieldlen;
+ cli_out ("%.*s", fieldlen, p);
+ p += fieldlen;
bricklen -= fieldlen;
} else {
num_tabs = (fieldlen - bricklen) / CLI_TAB_LENGTH + 1;
printf ("%s", p);
while (num_tabs-- != 0)
printf ("\t");
- cli_out ("%d\t%c\t%d", port, online?'Y':'N', pid);
+ if (status->port) {
+ if (status->online)
+ cli_out ("%d\t%c\t%s",
+ status->port,
+ status->online?'Y':'N',
+ status->pid_str);
+ else
+ cli_out ("%s\t%c\t%s",
+ "N/A",
+ status->online?'Y':'N',
+ status->pid_str);
+ }
+ else
+ cli_out ("%s\t%c\t%s",
+ "N/A", status->online?'Y':'N',
+ status->pid_str);
bricklen = 0;
}
}
@@ -1476,28 +1631,36 @@ cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word,
int ret = -1;
rpc_clnt_procedure_t *proc = NULL;
call_frame_t *frame = NULL;
- gf1_cli_heal_vol_req req = {0,};
int sent = 0;
int parse_error = 0;
+ dict_t *options = NULL;
+ xlator_t *this = NULL;
+ cli_local_t *local = NULL;
- frame = create_frame (THIS, THIS->ctx->pool);
+ this = THIS;
+ frame = create_frame (this, this->ctx->pool);
if (!frame)
goto out;
- if (wordcount != 3) {
+ if (wordcount < 3) {
cli_usage_out (word->pattern);
- parse_error = 1;
+ parse_error = 1;
goto out;
}
- req.volname = (char *)words[2];
- if (!req.volname)
+ ret = cli_cmd_volume_heal_options_parse (words, wordcount, &options);
+ if (ret) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
goto out;
+ }
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_HEAL_VOLUME];
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
- ret = proc->fn (frame, THIS, &req);
+ ret = proc->fn (frame, THIS, options);
}
out:
@@ -1507,6 +1670,8 @@ out:
cli_out ("Volume heal failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
@@ -1520,6 +1685,7 @@ cli_cmd_volume_statedump_cbk (struct cli_state *state, struct cli_cmd_word *word
dict_t *options = NULL;
int sent = 0;
int parse_error = 0;
+ cli_local_t *local = NULL;
frame = create_frame (THIS, THIS->ctx->pool);
if (!frame)
@@ -1531,7 +1697,7 @@ cli_cmd_volume_statedump_cbk (struct cli_state *state, struct cli_cmd_word *word
goto out;
}
- if (wordcount > 3) {
+ if (wordcount >= 3) {
ret = cli_cmd_volume_statedump_options_parse (words, wordcount,
&options);
if (ret) {
@@ -1541,19 +1707,6 @@ cli_cmd_volume_statedump_cbk (struct cli_state *state, struct cli_cmd_word *word
cli_out ("Error parsing options");
cli_usage_out (word->pattern);
}
- } else {
- options = dict_new ();
- if (!options) {
- ret = -1;
- gf_log ("cli", GF_LOG_ERROR, "Could not create dict");
- goto out;
- }
- ret = dict_set_str (options, "options","");
- if (ret)
- goto out;
- ret = dict_set_int32 (options, "option-cnt", 0);
- if (ret)
- goto out;
}
ret = dict_set_str (options, "volname", (char *)words[2]);
@@ -1561,6 +1714,9 @@ cli_cmd_volume_statedump_cbk (struct cli_state *state, struct cli_cmd_word *word
goto out;
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_STATEDUMP_VOLUME];
+
+ CLI_LOCAL_INIT (local, words, frame, options);
+
if (proc->fn) {
ret = proc->fn (frame, THIS, options);
}
@@ -1572,16 +1728,113 @@ out:
cli_out ("Volume statedump failed");
}
+ CLI_STACK_DESTROY (frame);
+
return ret;
}
+int
+cli_cmd_volume_list_cbk (struct cli_state *state, struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ int sent = 0;
+
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_LIST_VOLUME];
+ if (proc->fn) {
+ ret = proc->fn (frame, THIS, NULL);
+ }
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if (sent == 0)
+ cli_out ("Volume list failed");
+ }
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
+
+int
+cli_cmd_volume_clearlocks_cbk (struct cli_state *state,
+ struct cli_cmd_word *word,
+ const char **words, int wordcount)
+{
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *options = NULL;
+ int sent = 0;
+ int parse_error = 0;
+ cli_local_t *local = NULL;
+
+ frame = create_frame (THIS, THIS->ctx->pool);
+ if (!frame)
+ goto out;
+
+ if (wordcount < 7 || wordcount > 8) {
+ cli_usage_out (word->pattern);
+ parse_error = 1;
+ goto out;
+ }
+
+ ret = cli_cmd_volume_clrlks_opts_parse (words, wordcount, &options);
+ if (ret) {
+ parse_error = 1;
+ gf_log ("cli", GF_LOG_ERROR, "Error parsing "
+ "clear-locks options");
+ cli_out ("Error parsing options");
+ cli_usage_out (word->pattern);
+ }
+
+ ret = dict_set_str (options, "volname", (char *)words[2]);
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (options, "path", (char *)words[3]);
+ if (ret)
+ goto out;
+
+ proc = &cli_rpc_prog->proctable[GLUSTER_CLI_CLRLOCKS_VOLUME];
+
+ CLI_LOCAL_INIT (local, words, frame, options);
+
+ if (proc->fn) {
+ ret = proc->fn (frame, THIS, options);
+ }
+
+out:
+ if (ret) {
+ cli_cmd_sent_status_get (&sent);
+ if ((sent == 0) && (parse_error == 0))
+ cli_out ("Volume clear-locks failed");
+ }
+
+ CLI_STACK_DESTROY (frame);
+
+ return ret;
+}
struct cli_cmd volume_cmds[] = {
{ "volume info [all|<VOLNAME>]",
cli_cmd_volume_info_cbk,
"list information of all volumes"},
- { "volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] [transport <tcp|rdma|tcp,rdma>] <NEW-BRICK> ...",
+ { "volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] "
+ "[transport <tcp|rdma|tcp,rdma>] <NEW-BRICK>"
+#ifdef HAVE_BD_XLATOR
+ "?<vg_name>"
+#endif
+ "... [force]",
+
cli_cmd_volume_create_cbk,
"create a new volume of specified type with mentioned bricks"},
@@ -1601,19 +1854,19 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_volume_rename_cbk,
"rename volume <VOLNAME> to <NEW-VOLNAME>"},*/
- { "volume add-brick <VOLNAME> <NEW-BRICK> ...",
+ { "volume add-brick <VOLNAME> [<stripe|replica> <COUNT>] <NEW-BRICK> ... [force]",
cli_cmd_volume_add_brick_cbk,
"add brick to volume <VOLNAME>"},
- { "volume remove-brick <VOLNAME> <BRICK> ... {start|pause|abort|status|commit|force}",
+ { "volume remove-brick <VOLNAME> [replica <COUNT>] <BRICK> ... [start|stop|status|commit|force]",
cli_cmd_volume_remove_brick_cbk,
"remove brick from volume <VOLNAME>"},
- { "volume rebalance <VOLNAME> [fix-layout|migrate-data] {start|stop|status} [force]",
+ { "volume rebalance <VOLNAME> [fix-layout] {start|stop|status} [force]",
cli_cmd_volume_defrag_cbk,
"rebalance operations"},
- { "volume replace-brick <VOLNAME> <BRICK> <NEW-BRICK> {start|pause|abort|status|commit}",
+ { "volume replace-brick <VOLNAME> <BRICK> <NEW-BRICK> {start [force]|pause|abort|status|commit [force]}",
cli_cmd_volume_replace_brick_cbk,
"replace-brick operations"},
@@ -1629,14 +1882,6 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_volume_help_cbk,
"display help for the volume command"},
- { "volume log filename <VOLNAME> [BRICK] <PATH>",
- cli_cmd_log_filename_cbk,
- "set the log file for corresponding volume/brick"},
-
- { "volume log locate <VOLNAME> [BRICK]",
- cli_cmd_log_locate_cbk,
- "locate the log file for corresponding volume/brick"},
-
{ "volume log rotate <VOLNAME> [BRICK]",
cli_cmd_log_rotate_cbk,
"rotate the log file for corresponding volume/brick"},
@@ -1650,13 +1895,14 @@ struct cli_cmd volume_cmds[] = {
"reset all the reconfigured options"},
#if (SYNCDAEMON_COMPILE)
- {"volume "GEOREP" [<VOLNAME>] [<SLAVE-URL>] {start|stop|config|status} [options...]",
+ {"volume "GEOREP" [<VOLNAME>] [<SLAVE-URL>] {create [push-pem] [force]"
+ "|start [force]|stop [force]|config|status [detail]|delete} [options...]",
cli_cmd_volume_gsync_set_cbk,
"Geo-sync operations",
cli_cmd_check_gsync_exists_cbk},
#endif
- { "volume profile <VOLNAME> {start|info|stop}",
+ { "volume profile <VOLNAME> {start|stop|info [nfs]}",
cli_cmd_volume_profile_cbk,
"volume profile operations"},
@@ -1664,28 +1910,35 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_quota_cbk,
"quota translator specific operations"},
- { "volume top <VOLNAME> {[open|read|write|opendir|readdir] "
- "|[read-perf|write-perf bs <size> count <count>]} "
- " [brick <brick>] [list-cnt <count>]",
+ { "volume top <VOLNAME> {open|read|write|opendir|readdir|clear} [nfs|brick <brick>] [list-cnt <value>] |\n"
+ "volume top <VOLNAME> {read-perf|write-perf} [bs <size> count <count>] [brick <brick>] [list-cnt <value>]",
cli_cmd_volume_top_cbk,
"volume top operations"},
- {"volume log level <VOLNAME> <XLATOR[*]> <LOGLEVEL>",
- cli_cmd_log_level_cbk,
- "log level for translator"},
-
- { "volume status <VOLNAME>",
+ { "volume status [all | <VOLNAME> [nfs|shd|<BRICK>]]"
+ " [detail|clients|mem|inode|fd|callpool|tasks]",
cli_cmd_volume_status_cbk,
- "display status of specified volume"},
+ "display status of all or specified volume(s)/brick"},
- { "volume heal <VOLNAME>",
+ { "volume heal <VOLNAME> [{full | statistics {heal-count {replica <hostname:brickname>}} |info {healed | heal-failed | split-brain}}]",
cli_cmd_volume_heal_cbk,
- "Start healing of volume specified by <VOLNAME>"},
+ "self-heal commands on volume specified by <VOLNAME>"},
- {"volume statedump <VOLNAME> [all|mem|iobuf|callpool|priv|fd|inode]...",
+ {"volume statedump <VOLNAME> [nfs] [all|mem|iobuf|callpool|priv|fd|"
+ "inode|history]...",
cli_cmd_volume_statedump_cbk,
"perform statedump on bricks"},
+ {"volume list",
+ cli_cmd_volume_list_cbk,
+ "list all volumes in cluster"},
+
+ {"volume clear-locks <VOLNAME> <path> kind {blocked|granted|all}"
+ "{inode [range]|entry [basename]|posix [range]}",
+ cli_cmd_volume_clearlocks_cbk,
+ "Clear locks held on path"
+ },
+
{ NULL, NULL, NULL }
};
diff --git a/cli/src/cli-cmd.c b/cli/src/cli-cmd.c
index 17869eb61..b81f75b5b 100644
--- a/cli/src/cli-cmd.c
+++ b/cli/src/cli-cmd.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -60,6 +50,9 @@ cli_cmd_needs_connection (struct cli_cmd_word *word)
if (!strcasecmp ("getwd", word->word))
return 1;
+ if (!strcasecmp ("exit", word->word))
+ return 0;
+
return CLI_DEFAULT_CONN_TIMEOUT;
}
@@ -208,8 +201,7 @@ cli_cmd_process_line (struct cli_state *state, const char *text)
ret = cli_cmd_process (state, count, tokens);
out:
- if (copy)
- free (copy);
+ free (copy);
if (tokens)
cli_cmd_tokens_destroy (tokens);
@@ -239,6 +231,9 @@ cli_cmds_register (struct cli_state *state)
if (ret)
goto out;
+ ret = cli_cmd_snapshot_register (state);
+ if (ret)
+ goto out;
out:
return ret;
}
@@ -297,8 +292,6 @@ cli_cmd_await_response (unsigned time)
cmd_done = 0;
- cli_cmd_unlock ();
-
if (ret)
return ret;
@@ -370,8 +363,11 @@ cli_cmd_submit (void *req, call_frame_t *frame,
int ret = -1;
unsigned timeout = 0;
- timeout = (GLUSTER_CLI_PROFILE_VOLUME == procnum) ?
- CLI_TOP_CMD_TIMEOUT : CLI_DEFAULT_CMD_TIMEOUT;
+ if ((GLUSTER_CLI_PROFILE_VOLUME == procnum) ||
+ (GLUSTER_CLI_HEAL_VOLUME == procnum))
+ timeout = CLI_TEN_MINUTES_TIMEOUT;
+ else
+ timeout = CLI_DEFAULT_CMD_TIMEOUT;
cli_cmd_lock ();
cmd_sent = 0;
@@ -381,8 +377,9 @@ cli_cmd_submit (void *req, call_frame_t *frame,
if (!ret) {
cmd_sent = 1;
ret = cli_cmd_await_response (timeout);
- } else
- cli_cmd_unlock ();
+ }
+
+ cli_cmd_unlock ();
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
diff --git a/cli/src/cli-cmd.h b/cli/src/cli-cmd.h
index 82e0ff113..041729276 100644
--- a/cli/src/cli-cmd.h
+++ b/cli/src/cli-cmd.h
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __CLI_CMD_H__
#define __CLI_CMD_H__
@@ -30,6 +20,32 @@
#include "cli.h"
#include "list.h"
+#define CLI_LOCAL_INIT(local, words, frame, dictionary) \
+ do { \
+ local = cli_local_get (); \
+ \
+ if (local) { \
+ local->words = words; \
+ if (dictionary) \
+ local->dict = dictionary; \
+ if (frame) \
+ frame->local = local; \
+ } \
+ } while (0)
+
+#define CLI_STACK_DESTROY(_frame) \
+ do { \
+ if (_frame) { \
+ if (_frame->local) { \
+ gf_log ("cli", GF_LOG_DEBUG, "frame->local " \
+ "is not NULL (%p)", _frame->local); \
+ cli_local_wipe (_frame->local); \
+ _frame->local = NULL; \
+ } \
+ STACK_DESTROY (_frame->root); \
+ } \
+ } while (0)
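+
+ /* Typical callback usage (illustrative sketch, mirroring the callbacks
+ * in cli-cmd-volume.c):
+ *
+ * CLI_LOCAL_INIT (local, words, frame, options);
+ * if (proc->fn)
+ * ret = proc->fn (frame, THIS, options);
+ * out:
+ * CLI_STACK_DESTROY (frame);
+ */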
+
typedef enum {
GF_ANSWER_YES = 1,
GF_ANSWER_NO = 2
@@ -77,6 +93,8 @@ int cli_cmd_probe_register (struct cli_state *state);
int cli_cmd_system_register (struct cli_state *state);
+int cli_cmd_snapshot_register (struct cli_state *state);
+
int cli_cmd_misc_register (struct cli_state *state);
struct cli_cmd_word *cli_cmd_nextword (struct cli_cmd_word *word,
@@ -102,4 +120,5 @@ cli_cmd_submit (void *req, call_frame_t *frame,
gf_answer_t
cli_cmd_get_confirmation (struct cli_state *state, const char *question);
int cli_cmd_sent_status_get (int *status);
+
#endif /* __CLI_CMD_H__ */
diff --git a/cli/src/cli-mem-types.h b/cli/src/cli-mem-types.h
index 3c49d2183..09fcb639b 100644
--- a/cli/src/cli-mem-types.h
+++ b/cli/src/cli-mem-types.h
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __CLI_MEM_TYPES_H__
#define __CLI_MEM_TYPES_H__
diff --git a/cli/src/cli-rl.c b/cli/src/cli-rl.c
index 80b14620f..ade1c8ebb 100644
--- a/cli/src/cli-rl.c
+++ b/cli/src/cli-rl.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -48,7 +38,6 @@ cli_rl_out (struct cli_state *state, const char *fmt, va_list ap)
{
int tmp_rl_point = rl_point;
int n = rl_end;
- int i = 0;
int ret = 0;
if (rl_end >= 0 ) {
@@ -56,12 +45,7 @@ cli_rl_out (struct cli_state *state, const char *fmt, va_list ap)
rl_redisplay ();
}
- printf ("\r");
-
- for (i = 0; i <= strlen (state->prompt); i++)
- printf (" ");
-
- printf ("\r");
+ printf ("\r%*s\r", (int)strlen (state->prompt), "");
ret = vprintf (fmt, ap);
@@ -77,6 +61,34 @@ cli_rl_out (struct cli_state *state, const char *fmt, va_list ap)
return ret;
}
+int
+cli_rl_err (struct cli_state *state, const char *fmt, va_list ap)
+{
+ int tmp_rl_point = rl_point;
+ int n = rl_end;
+ int ret = 0;
+
+ if (rl_end >= 0 ) {
+ rl_kill_text (0, rl_end);
+ rl_redisplay ();
+ }
+
+ fprintf (stderr, "\r%*s\r", (int)strlen (state->prompt), "");
+
+ ret = vfprintf (stderr, fmt, ap);
+
+ fprintf (stderr, "\n");
+ fflush(stderr);
+
+ if (n) {
+ rl_do_undo ();
+ rl_point = tmp_rl_point;
+ rl_reset_line_state ();
+ }
+
+ return ret;
+}
+
void
cli_rl_process_line (char *line)
@@ -89,9 +101,14 @@ cli_rl_process_line (char *line)
state->rl_processing = 1;
{
ret = cli_cmd_process_line (state, line);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to process line");
+
add_history (line);
}
state->rl_processing = 0;
+
}
@@ -187,7 +204,7 @@ cli_rl_tokenize (const char *text)
}
if (i < count) {
- /* symoblize that what needs to be autocompleted is
+ /* symbolize that what needs to be autocompleted is
the full set of possible nextwords, and not extend
the last word
*/
@@ -199,8 +216,7 @@ cli_rl_tokenize (const char *text)
}
out:
- if (copy)
- free (copy);
+ free (copy);
if (i < count) {
cli_cmd_tokens_destroy (tokens);
@@ -347,9 +363,10 @@ cli_rl_input (void *_data)
for (;;) {
line = readline (state->prompt);
if (!line)
- break;
+ exit (0);
- cli_rl_process_line (line);
+ if (*line)
+ cli_rl_process_line (line);
free (line);
}
diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c
index 1e77ae0ab..bfeb854ad 100644
--- a/cli/src/cli-rpc-ops.c
+++ b/cli/src/cli-rpc-ops.c
@@ -1,32 +1,28 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#ifndef GSYNC_CONF
-#define GSYNC_CONF GEOREP"/gsyncd.conf"
-#endif
-#define DEFAULT_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
+/* Widths of various columns in top read/write-perf output
+ * Total width of top read/write-perf should be 80 chars
+ * including one space between columns
+ */
+#define VOL_TOP_PERF_FILENAME_DEF_WIDTH 47
+#define VOL_TOP_PERF_FILENAME_ALT_WIDTH 44
+#define VOL_TOP_PERF_SPEED_WIDTH 4
+#define VOL_TOP_PERF_TIME_WIDTH 26
+
+#define INDENT_MAIN_HEAD "%-25s %s "
#include "cli.h"
#include "compat-errno.h"
@@ -50,25 +46,39 @@ extern rpc_clnt_prog_t *cli_rpc_prog;
extern int cli_op_ret;
extern int connected;
-char *cli_volume_type[] = {"Distribute",
- "Stripe",
- "Replicate",
- "Striped-Replicate (RAID 01)",
- "Distributed-Stripe",
- "Distributed-Replicate",
- "Distributed-Striped-Replicate (RAID 01)",
-};
-
-
-char *cli_volume_status[] = {"Created",
- "Started",
- "Stopped"
+char *cli_vol_type_str[] = {"Distribute",
+ "Stripe",
+ "Replicate",
+ "Striped-Replicate",
+ "Distributed-Stripe",
+ "Distributed-Replicate",
+ "Distributed-Striped-Replicate",
+ };
+
+char *cli_vol_status_str[] = {"Created",
+ "Started",
+ "Stopped",
+ };
+
+char *cli_vol_task_status_str[] = {"not started",
+ "in progress",
+ "stopped",
+ "completed",
+ "failed",
+ "fix-layout in progress",
+ "fix-layout stopped",
+ "fix-layout completed",
+ "fix-layout failed",
};
int32_t
-gf_cli3_1_get_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_get_volume (call_frame_t *frame, xlator_t *this,
void *data);
+int
+cli_to_glusterd (gf_cli_req *req, call_frame_t *frame, fop_cbk_fn_t cbkfn,
+ xdrproc_t xdrproc, dict_t *dict, int procnum, xlator_t *this,
+ rpc_clnt_prog_t *prog, struct iobref *iobref);
rpc_clnt_prog_t cli_handshake_prog = {
.progname = "cli handshake",
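[Editor's note] The renamed string tables (cli_vol_type_str, cli_vol_status_str) and the new cli_vol_task_status_str are indexed directly by the numeric type/status codes returned by glusterd, which is why their ordering must match the corresponding enums. A small standalone sketch of that lookup pattern, with an invented enum and a bounds check, follows.

    #include <stdio.h>

    /* Invented example enum; in the CLI the indices come from glusterd. */
    typedef enum {
            VOL_CREATED = 0,
            VOL_STARTED,
            VOL_STOPPED,
            VOL_STATUS_MAX
    } vol_status_t;

    static const char *vol_status_str[] = {
            "Created",
            "Started",
            "Stopped",
    };

    static const char *status_to_str (int status)
    {
            /* Guard against codes the table does not know about. */
            if (status < 0 || status >= VOL_STATUS_MAX)
                    return "unknown";
            return vol_status_str[status];
    }

    int main (void)
    {
            printf ("Status: %s\n", status_to_str (VOL_STARTED));
            printf ("Status: %s\n", status_to_str (42));
            return 0;
    }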
@@ -82,76 +92,51 @@ rpc_clnt_prog_t cli_pmap_prog = {
.progver = GLUSTER_PMAP_VERSION,
};
-
int
-gf_cli3_1_probe_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_probe_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_probe_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_probe_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
//rsp.op_ret = -1;
//rsp.op_errno = EINVAL;
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to probe");
- if (!rsp.op_ret) {
- switch (rsp.op_errno) {
- case GF_PROBE_SUCCESS:
- cli_out ("Probe successful");
- break;
- case GF_PROBE_LOCALHOST:
- cli_out ("Probe on localhost not needed");
- break;
- case GF_PROBE_FRIEND:
- cli_out ("Probe on host %s port %d already"
- " in peer list", rsp.hostname, rsp.port);
- break;
- default:
- cli_out ("Probe returned with unknown errno %d",
- rsp.op_errno);
- break;
- }
- }
- if (rsp.op_ret) {
- switch (rsp.op_errno) {
- case GF_PROBE_ANOTHER_CLUSTER:
- cli_out ("%s is already part of "
- "another cluster", rsp.hostname);
- break;
- case GF_PROBE_VOLUME_CONFLICT:
- cli_out ("Atleast one volume on %s conflicts "
- "with existing volumes in the "
- "cluster", rsp.hostname);
- break;
- case GF_PROBE_UNKNOWN_PEER:
- cli_out ("%s responded with 'unknown peer' error, "
- "this could happen if %s doesn't have"
- " localhost in its peer database",
- rsp.hostname, rsp.hostname);
- break;
- case GF_PROBE_ADD_FAILED:
- cli_out ("Failed to add peer information "
- "on %s" , rsp.hostname);
- break;
+ if (rsp.op_errstr && (strlen (rsp.op_errstr) > 0)) {
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
+ if (rsp.op_ret)
+ gf_log ("cli", GF_LOG_ERROR, "%s", msg);
+ }
- default:
- cli_out ("Probe unsuccessful\nProbe returned "
- "with unknown errno %d", rsp.op_errno);
- break;
- }
- gf_log ("glusterd",GF_LOG_ERROR,"Probe failed with op_ret %d"
- " and op_errno %d", rsp.op_ret, rsp.op_errno);
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str (NULL,
+ (rsp.op_ret)? NULL : msg,
+ rsp.op_ret, rsp.op_errno,
+ (rsp.op_ret)? msg : NULL);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
}
+
+ if (!rsp.op_ret)
+ cli_out ("peer probe: success. %s", msg);
+ else
+ cli_err ("peer probe: failed: %s", msg);
+
ret = rsp.op_ret;
out:
@@ -160,51 +145,52 @@ out:
}
int
-gf_cli3_1_deprobe_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_deprobe_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_deprobe_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_deprobe_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
//rsp.op_ret = -1;
//rsp.op_errno = EINVAL;
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to deprobe");
+
if (rsp.op_ret) {
- switch (rsp.op_errno) {
- case GF_DEPROBE_LOCALHOST:
- cli_out ("%s is localhost",
- rsp.hostname);
- break;
- case GF_DEPROBE_NOT_FRIEND:
- cli_out ("%s is not part of cluster",
- rsp.hostname);
- break;
- case GF_DEPROBE_BRICK_EXIST:
- cli_out ("Brick(s) with the peer %s exist in "
- "cluster", rsp.hostname);
- break;
- default:
- cli_out ("Detach unsuccessful\nDetach returned "
- "with unknown errno %d",
- rsp.op_errno);
- break;
+ if (strlen (rsp.op_errstr) > 0) {
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
+ gf_log ("cli", GF_LOG_ERROR, "%s", rsp.op_errstr);
}
- gf_log ("glusterd",GF_LOG_ERROR,"Detach failed with op_ret %d"
- " and op_errno %d", rsp.op_ret, rsp.op_errno);
} else {
- cli_out ("Detach successful");
+ snprintf (msg, sizeof (msg), "success");
}
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str (NULL,
+ (rsp.op_ret)? NULL : msg,
+ rsp.op_ret, rsp.op_errno,
+ (rsp.op_ret)? msg : NULL);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (!rsp.op_ret)
+ cli_out ("peer detach: %s", msg);
+ else
+ cli_err ("peer detach: failed: %s", msg);
ret = rsp.op_ret;
@@ -214,35 +200,143 @@ out:
}
int
-gf_cli3_1_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
+gf_cli_output_peer_status (dict_t *dict, int count)
{
- gf1_cli_peer_list_rsp rsp = {0,};
- int ret = 0;
- dict_t *dict = NULL;
+ int ret = -1;
char *uuid_buf = NULL;
char *hostname_buf = NULL;
int32_t i = 1;
char key[256] = {0,};
char *state = NULL;
- int32_t port = 0;
int32_t connected = 0;
char *connected_str = NULL;
+ cli_out ("Number of Peers: %d", count);
+ i = 1;
+ while ( i <= count) {
+ snprintf (key, 256, "friend%d.uuid", i);
+ ret = dict_get_str (dict, key, &uuid_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.connected", i);
+ ret = dict_get_int32 (dict, key, &connected);
+ if (ret)
+ goto out;
+ if (connected)
+ connected_str = "Connected";
+ else
+ connected_str = "Disconnected";
+
+
+ snprintf (key, 256, "friend%d.state", i);
+ ret = dict_get_str (dict, key, &state);
+ if (ret)
+ goto out;
+
+ cli_out ("\nHostname: %s\nUuid: %s\nState: %s (%s)",
+ hostname_buf, uuid_buf, state, connected_str);
+ i++;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+gf_cli_output_pool_list (dict_t *dict, int count)
+{
+ int ret = -1;
+ char *uuid_buf = NULL;
+ char *hostname_buf = NULL;
+ int32_t i = 1;
+ char key[256] = {0,};
+ int32_t connected = 0;
+ char *connected_str = NULL;
+
+ if (count >= 1)
+ cli_out ("UUID\t\t\t\t\tHostname\tState");
+
+ while ( i <= count) {
+ snprintf (key, 256, "friend%d.uuid", i);
+ ret = dict_get_str (dict, key, &uuid_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname_buf);
+ if (ret)
+ goto out;
+
+ snprintf (key, 256, "friend%d.connected", i);
+ ret = dict_get_int32 (dict, key, &connected);
+ if (ret)
+ goto out;
+ if (connected)
+ connected_str = "Connected";
+ else
+ connected_str = "Disconnected";
+
+ cli_out ("%s\t%-9s\t%s ", uuid_buf, hostname_buf,
+ connected_str);
+ i++;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/* function pointer for gf_cli_output_{pool_list,peer_status} */
+typedef int (*cli_friend_output_fn) (dict_t*, int);
+
+int
+gf_cli_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gf1_cli_peer_list_rsp rsp = {0,};
+ int ret = -1;
+ dict_t *dict = NULL;
+ char msg[1024] = {0,};
+ char *cmd = NULL;
+ cli_friend_output_fn friend_output_fn;
+ call_frame_t *frame = NULL;
+ unsigned long flags = 0;
+
+ frame = myframe;
+ flags = (long)frame->local;
+
+ if (flags == GF_CLI_LIST_POOL_NODES) {
+ cmd = "pool list";
+ friend_output_fn = &gf_cli_output_pool_list;
+ } else {
+ cmd = "peer status";
+ friend_output_fn = &gf_cli_output_peer_status;
+ }
+
+ /* 'free' the flags set by gf_cli_list_friends */
+ frame->local = NULL;
+
if (-1 == req->rpc_status) {
goto out;
}
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_peer_list_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
//rsp.op_ret = -1;
//rsp.op_errno = EINVAL;
goto out;
}
-
- gf_log ("cli", GF_LOG_INFO, "Received resp to list: %d",
+ gf_log ("cli", GF_LOG_DEBUG, "Received resp to list: %d",
rsp.op_ret);
ret = rsp.op_ret;
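[Editor's note] gf_cli_list_friends_cbk now serves both "peer status" and "pool list" by stashing the command flag in frame->local and selecting the matching formatter through the cli_friend_output_fn function pointer. The standalone sketch below shows that dispatch pattern with two dummy formatters; the flag values and the int argument are invented for the example.

    #include <stdio.h>

    /* Both formatters share one signature, so either can be called
     * through the same pointer. */
    typedef int (*friend_output_fn) (int count);

    static int output_peer_status (int count)
    {
            printf ("Number of Peers: %d\n", count);
            return 0;
    }

    static int output_pool_list (int count)
    {
            printf ("UUID\t\t\t\t\tHostname\tState (%d entries)\n", count);
            return 0;
    }

    enum { LIST_PEERS = 0, LIST_POOL_NODES = 1 };   /* example flags */

    int main (void)
    {
            int flag = LIST_POOL_NODES;
            friend_output_fn fn = NULL;

            /* One callback, two output formats: choose once, call blindly. */
            fn = (flag == LIST_POOL_NODES) ? output_pool_list
                                           : output_peer_status;
            return fn (3);
    }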
@@ -250,7 +344,19 @@ gf_cli3_1_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
if (!rsp.op_ret) {
if (!rsp.friends.friends_len) {
- cli_out ("No peers present");
+ snprintf (msg, sizeof (msg),
+ "%s: No peers present", cmd);
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_peer_status (dict,
+ rsp.op_ret,
+ rsp.op_errno,
+ msg);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+ cli_err ("%s", msg);
ret = 0;
goto out;
}
@@ -272,58 +378,34 @@ gf_cli3_1_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- ret = dict_get_int32 (dict, "count", &count);
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_peer_status (dict, rsp.op_ret,
+ rsp.op_errno, msg);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+ ret = dict_get_int32 (dict, "count", &count);
if (ret) {
goto out;
}
- cli_out ("Number of Peers: %d", count);
-
- while ( i <= count) {
- snprintf (key, 256, "friend%d.uuid", i);
- ret = dict_get_str (dict, key, &uuid_buf);
- if (ret)
- goto out;
-
- snprintf (key, 256, "friend%d.hostname", i);
- ret = dict_get_str (dict, key, &hostname_buf);
- if (ret)
- goto out;
-
- snprintf (key, 256, "friend%d.connected", i);
- ret = dict_get_int32 (dict, key, &connected);
- if (ret)
- goto out;
- if (connected)
- connected_str = "Connected";
- else
- connected_str = "Disconnected";
-
- snprintf (key, 256, "friend%d.port", i);
- ret = dict_get_int32 (dict, key, &port);
- if (ret)
- goto out;
-
- snprintf (key, 256, "friend%d.state", i);
- ret = dict_get_str (dict, key, &state);
- if (ret)
- goto out;
-
- if (!port) {
- cli_out ("\nHostname: %s\nUuid: %s\nState: %s "
- "(%s)",
- hostname_buf, uuid_buf, state,
- connected_str);
- } else {
- cli_out ("\nHostname: %s\nPort: %d\nUuid: %s\n"
- "State: %s (%s)", hostname_buf, port,
- uuid_buf, state, connected_str);
- }
- i++;
+ ret = friend_output_fn (dict, count);
+ if (ret) {
+ goto out;
}
} else {
- ret = -1;
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_peer_status (dict, rsp.op_ret,
+ rsp.op_errno, NULL);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ } else {
+ ret = -1;
+ }
goto out;
}
@@ -333,7 +415,7 @@ gf_cli3_1_list_friends_cbk (struct rpc_req *req, struct iovec *iov,
out:
cli_cmd_broadcast_response (ret);
if (ret)
- cli_out ("Peer status unsuccessful");
+ cli_err ("%s: failed", cmd);
if (dict)
dict_destroy (dict);
@@ -367,596 +449,1067 @@ cli_out_options ( char *substr, char *optstr, char *valstr)
cli_out ("%s: %s",ptr2 , valstr);
}
+static int
+_gf_cli_output_volinfo_opts (dict_t *d, char *k,
+ data_t *v, void *tmp)
+{
+ int ret = 0;
+ char *key = NULL;
+ char *ptr = NULL;
+ data_t *value = NULL;
+
+ key = tmp;
+
+ ptr = strstr (k, "option.");
+ if (ptr) {
+ value = v;
+ if (!value) {
+ ret = -1;
+ goto out;
+ }
+ cli_out_options (key, k, v->data);
+ }
+out:
+ return ret;
+}
+
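[Editor's note] _gf_cli_output_volinfo_opts() above replaces the old hand-rolled walk over dict->members_list: it is intended to be passed to dict_foreach(), which invokes it once per dictionary entry, and it prints only entries whose key contains "option.". The standalone sketch below imitates that shape with a plain key/value array and the same strstr() filter; the entry type and the iteration helper here are stand-ins for illustration, not the libglusterfs dict API.

    #include <stdio.h>
    #include <string.h>

    struct kv { const char *key; const char *value; };

    /* Callback with the same shape as the real per-entry handler:
     * ignore keys that do not contain "option.". */
    static int print_if_option (const struct kv *entry, void *data)
    {
            const char *ptr = strstr (entry->key, "option.");

            (void) data;                            /* unused in this sketch */
            if (ptr == NULL)
                    return 0;                       /* not a volume option */

            /* print the option name (after "option.") and its value */
            printf ("%s: %s\n", ptr + strlen ("option."), entry->value);
            return 0;
    }

    /* Minimal stand-in for dict_foreach(): call fn for every entry. */
    static int foreach_entry (const struct kv *entries, int n,
                              int (*fn) (const struct kv *, void *),
                              void *data)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (fn (&entries[i], data))
                            return -1;
            return 0;
    }

    int main (void)
    {
            struct kv vol[] = {
                    { "volume0.name",                         "testvol" },
                    { "volume0.option.performance.io-cache",  "off"     },
                    { "volume0.option.auth.allow",            "*"       },
            };

            printf ("Options Reconfigured:\n");
            return foreach_entry (vol, 3, print_if_option, NULL);
    }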
int
-gf_cli3_1_get_volume_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
+gf_cli_get_volume_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
{
- gf1_cli_get_vol_rsp rsp = {0,};
- int ret = 0;
- dict_t *dict = NULL;
- char *volname = NULL;
- int32_t i = 0;
- char key[1024] = {0,};
- int32_t status = 0;
- int32_t type = 0;
- int32_t brick_count = 0;
- int32_t dist_count = 0;
- int32_t stripe_count = 0;
- int32_t replica_count = 0;
- int32_t vol_type = 0;
- char *brick = NULL;
- int32_t j = 1;
- cli_local_t *local = NULL;
- int32_t transport = 0;
- data_pair_t *pairs = NULL;
- char *ptr = NULL;
- data_t *value = NULL;
- int opt_count = 0;
- int k = 0;
- char err_str[2048] = {0};
-
- snprintf (err_str, sizeof (err_str), "Volume info unsuccessful");
- if (-1 == req->rpc_status) {
+ int ret = -1;
+ int opt_count = 0;
+ int32_t i = 0;
+ int32_t j = 1;
+ int32_t status = 0;
+ int32_t type = 0;
+ int32_t brick_count = 0;
+ int32_t dist_count = 0;
+ int32_t stripe_count = 0;
+ int32_t replica_count = 0;
+ int32_t vol_type = 0;
+ int32_t transport = 0;
+ char *volume_id_str = NULL;
+ char *brick = NULL;
+ char *volname = NULL;
+ dict_t *dict = NULL;
+ cli_local_t *local = NULL;
+ char key[1024] = {0};
+ char err_str[2048] = {0};
+ gf_cli_rsp rsp = {0};
+ char *caps = NULL;
+ int k __attribute__((unused)) = 0;
+ // snap_volume indicates whether the volume is a normal volume
+ // or a volume created for a snapshot
+ int32_t snap_volume = 0;
+
+ if (-1 == req->rpc_status)
goto out;
- }
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_get_vol_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
- //rsp.op_ret = -1;
- //rsp.op_errno = EINVAL;
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
-
gf_log ("cli", GF_LOG_INFO, "Received resp to get vol: %d",
rsp.op_ret);
- if (!rsp.op_ret) {
+ if (rsp.op_ret) {
+ ret = -1;
+ goto out;
+ }
- if (!rsp.volumes.volumes_len) {
- cli_out ("No volumes present");
- ret = 0;
- goto out;
- }
+ if (!rsp.dict.dict_len) {
+ if (global_state->mode & GLUSTER_MODE_XML)
+ goto xml_output;
+ cli_err ("No volumes present");
+ ret = 0;
+ goto out;
+ }
- dict = dict_new ();
+ dict = dict_new ();
- if (!dict) {
- ret = -1;
- goto out;
- }
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
- ret = dict_unserialize (rsp.volumes.volumes_val,
- rsp.volumes.volumes_len,
- &dict);
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &dict);
- if (ret) {
- gf_log ("", GF_LOG_ERROR,
- "Unable to allocate memory");
- goto out;
- }
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to allocate memory");
+ goto out;
+ }
- ret = dict_get_int32 (dict, "count", &count);
+ ret = dict_get_int32 (dict, "count", &count);
+ if (ret)
+ goto out;
- if (ret) {
- goto out;
- }
+ local = ((call_frame_t *)myframe)->local;
- local = ((call_frame_t *)myframe)->local;
- //cli_out ("Number of Volumes: %d", count);
+ if (!count) {
+ switch (local->get_vol.flags) {
- if (!count && (local->u.get_vol.flags ==
- GF_CLI_GET_NEXT_VOLUME)) {
- local->u.get_vol.volname = NULL;
+ case GF_CLI_GET_NEXT_VOLUME:
+ GF_FREE (local->get_vol.volname);
+ local->get_vol.volname = NULL;
ret = 0;
goto out;
- } else if (!count && (local->u.get_vol.flags ==
- GF_CLI_GET_VOLUME)) {
+
+ case GF_CLI_GET_VOLUME:
+ memset (err_str, 0, sizeof (err_str));
snprintf (err_str, sizeof (err_str),
"Volume %s does not exist",
- local->u.get_vol.volname);
+ local->get_vol.volname);
ret = -1;
- goto out;
+ if (!(global_state->mode & GLUSTER_MODE_XML))
+ goto out;
}
+ }
- while ( i < count) {
- cli_out (" ");
- snprintf (key, 256, "volume%d.name", i);
- ret = dict_get_str (dict, key, &volname);
- if (ret)
+xml_output:
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ /* For GET_NEXT_VOLUME, output is already begun in,
+ * and will also end in, gf_cli_get_next_volume()
+ */
+ if (local->get_vol.flags == GF_CLI_GET_VOLUME) {
+ ret = cli_xml_output_vol_info_begin
+ (local, rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
goto out;
+ }
+ }
- snprintf (key, 256, "volume%d.type", i);
- ret = dict_get_int32 (dict, key, &type);
- if (ret)
+ if (dict) {
+ ret = cli_xml_output_vol_info (local, dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
goto out;
+ }
+ }
- snprintf (key, 256, "volume%d.status", i);
- ret = dict_get_int32 (dict, key, &status);
+ if (local->get_vol.flags == GF_CLI_GET_VOLUME) {
+ ret = cli_xml_output_vol_info_end (local);
if (ret)
- goto out;
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ }
+ goto out;
+ }
- snprintf (key, 256, "volume%d.brick_count", i);
- ret = dict_get_int32 (dict, key, &brick_count);
- if (ret)
- goto out;
+ while ( i < count) {
+ cli_out (" ");
+ snprintf (key, 256, "volume%d.name", i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret)
+ goto out;
- snprintf (key, 256, "volume%d.dist_count", i);
- ret = dict_get_int32 (dict, key, &dist_count);
- if (ret)
- goto out;
+ snprintf (key, 256, "volume%d.type", i);
+ ret = dict_get_int32 (dict, key, &type);
+ if (ret)
+ goto out;
- snprintf (key, 256, "volume%d.stripe_count", i);
- ret = dict_get_int32 (dict, key, &stripe_count);
- if (ret)
- goto out;
+ snprintf (key, 256, "volume%d.status", i);
+ ret = dict_get_int32 (dict, key, &status);
+ if (ret)
+ goto out;
- snprintf (key, 256, "volume%d.replica_count", i);
- ret = dict_get_int32 (dict, key, &replica_count);
- if (ret)
- goto out;
+ snprintf (key, sizeof (key), "volume%d.snap_volume", i);
+ ret = dict_get_int32 (dict, key, &snap_volume);
+ if (ret)
+ goto out;
- snprintf (key, 256, "volume%d.transport", i);
- ret = dict_get_int32 (dict, key, &transport);
- if (ret)
- goto out;
+ snprintf (key, 256, "volume%d.brick_count", i);
+ ret = dict_get_int32 (dict, key, &brick_count);
+ if (ret)
+ goto out;
- vol_type = type;
-
- // Distributed (stripe/replicate/raid01) setups
- if ((type > 0) && ( dist_count < brick_count))
- vol_type = type + 3;
-
- cli_out ("Volume Name: %s", volname);
- cli_out ("Type: %s", cli_volume_type[vol_type]);
- cli_out ("Status: %s", cli_volume_status[status]);
-
- if (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE)
- cli_out ("Number of Bricks: %d x %d x %d = %d",
- (brick_count / dist_count),
- stripe_count,
- replica_count,
- brick_count);
- else if (type == GF_CLUSTER_TYPE_NONE)
- cli_out ("Number of Bricks: %d",
- brick_count);
- else
- /* For both replicate and stripe, dist_count is
- good enough */
- cli_out ("Number of Bricks: %d x %d = %d",
- (brick_count / dist_count),
- dist_count,
- brick_count);
+ snprintf (key, 256, "volume%d.dist_count", i);
+ ret = dict_get_int32 (dict, key, &dist_count);
+ if (ret)
+ goto out;
+ snprintf (key, 256, "volume%d.stripe_count", i);
+ ret = dict_get_int32 (dict, key, &stripe_count);
+ if (ret)
+ goto out;
- cli_out ("Transport-type: %s",
- ((transport == 0)?"tcp":
- (transport == 1)?"rdma":
- "tcp,rdma"));
- j = 1;
+ snprintf (key, 256, "volume%d.replica_count", i);
+ ret = dict_get_int32 (dict, key, &replica_count);
+ if (ret)
+ goto out;
+ snprintf (key, 256, "volume%d.transport", i);
+ ret = dict_get_int32 (dict, key, &transport);
+ if (ret)
+ goto out;
- GF_FREE (local->u.get_vol.volname);
- local->u.get_vol.volname = gf_strdup (volname);
+ snprintf (key, 256, "volume%d.volume_id", i);
+ ret = dict_get_str (dict, key, &volume_id_str);
+ if (ret)
+ goto out;
- if (brick_count)
- cli_out ("Bricks:");
+ vol_type = type;
- while ( j <= brick_count) {
- snprintf (key, 1024, "volume%d.brick%d",
- i, j);
- ret = dict_get_str (dict, key, &brick);
+ // Distributed (stripe/replicate/stripe-replica) setups
+ if ((type > 0) && ( dist_count < brick_count))
+ vol_type = type + 3;
+
+ cli_out ("Volume Name: %s", volname);
+ cli_out ("Type: %s", cli_vol_type_str[vol_type]);
+ cli_out ("Volume ID: %s", volume_id_str);
+ cli_out ("Status: %s", cli_vol_status_str[status]);
+ if (snap_volume)
+ cli_out ("Snap Volume: %s", "yes");
+ else
+ cli_out ("Snap Volume: %s", "no");
+
+#ifdef HAVE_BD_XLATOR
+ k = 0;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.xlator%d", i, k);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ goto next;
+ do {
+ j = 0;
+ cli_out ("Xlator %d: %s", k + 1, caps);
+ do {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "volume%d.xlator%d.caps%d",
+ i, k, j++);
+ ret = dict_get_str (dict, key, &caps);
if (ret)
- goto out;
- cli_out ("Brick%d: %s", j, brick);
- j++;
- }
- pairs = dict->members_list;
- if (!pairs) {
- ret = -1;
- goto out;
- }
+ break;
+ cli_out ("Capability %d: %s", j, caps);
+ } while (1);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "volume%d.xlator%d", i, ++k);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ break;
+ } while (1);
+
+next:
+#else
+ caps = 0; /* Avoid compiler warnings when BD not enabled */
+#endif
+
+ if (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) {
+ cli_out ("Number of Bricks: %d x %d x %d = %d",
+ (brick_count / dist_count),
+ stripe_count,
+ replica_count,
+ brick_count);
+ } else if (type == GF_CLUSTER_TYPE_NONE) {
+ cli_out ("Number of Bricks: %d", brick_count);
+ } else {
+ /* For both replicate and stripe, dist_count is
+ good enough */
+ cli_out ("Number of Bricks: %d x %d = %d",
+ (brick_count / dist_count),
+ dist_count, brick_count);
+ }
+
+ cli_out ("Transport-type: %s",
+ ((transport == 0)?"tcp":
+ (transport == 1)?"rdma":
+ "tcp,rdma"));
+ j = 1;
- snprintf (key, 256, "volume%d.opt_count",i);
- ret = dict_get_int32 (dict, key, &opt_count);
+ GF_FREE (local->get_vol.volname);
+ local->get_vol.volname = gf_strdup (volname);
+
+ if (brick_count)
+ cli_out ("Bricks:");
+
+ while (j <= brick_count) {
+ snprintf (key, 1024, "volume%d.brick%d", i, j);
+ ret = dict_get_str (dict, key, &brick);
if (ret)
- goto out;
-
- if (!opt_count)
- goto out;
-
- cli_out ("Options Reconfigured:");
- k = 0;
- while ( k < opt_count) {
-
- snprintf (key, 256, "volume%d.option.",i);
- while (pairs) {
- ptr = strstr (pairs->key, "option.");
- if (ptr) {
- value = pairs->value;
- if (!value) {
- ret = -1;
- goto out;
- }
- cli_out_options (key, pairs->key,
- value->data);
- }
- pairs = pairs->next;
- }
- k++;
- }
+ goto out;
- i++;
+ cli_out ("Brick%d: %s", j, brick);
+#ifdef HAVE_BD_XLATOR
+ snprintf (key, 256, "volume%d.vg%d", i, j);
+ ret = dict_get_str (dict, key, &caps);
+ if (!ret)
+ cli_out ("Brick%d VG: %s", j, caps);
+#endif
+ j++;
}
+ snprintf (key, 256, "volume%d.opt_count",i);
+ ret = dict_get_int32 (dict, key, &opt_count);
+ if (ret)
+ goto out;
- } else {
- ret = -1;
- goto out;
+ if (!opt_count)
+ goto out;
+
+ cli_out ("Options Reconfigured:");
+
+ snprintf (key, 256, "volume%d.option.",i);
+
+ ret = dict_foreach (dict, _gf_cli_output_volinfo_opts, key);
+ if (ret)
+ goto out;
+
+ i++;
}
ret = 0;
-
out:
cli_cmd_broadcast_response (ret);
if (ret)
- cli_out ("%s", err_str);
+ cli_err ("%s", err_str);
if (dict)
dict_destroy (dict);
- gf_log ("", GF_LOG_INFO, "Returning: %d", ret);
+ free (rsp.dict.dict_val);
+
+ free (rsp.op_errstr);
+
+ gf_log ("cli", GF_LOG_INFO, "Returning: %d", ret);
return ret;
}
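[Editor's note] The "Number of Bricks" line printed above encodes the volume layout: for a striped-replicated volume it is (bricks / dist_count) x stripe x replica = bricks, for a plain distribute volume just the brick count, and otherwise (bricks / dist_count) x dist_count = bricks, where dist_count is the number of bricks per distribute subvolume. A standalone sketch of that formatting decision, with made-up counts, follows.

    #include <stdio.h>

    enum { TYPE_NONE = 0, TYPE_STRIPE, TYPE_REPLICATE, TYPE_STRIPE_REPLICATE };

    /* Reproduce the three output shapes used for the brick-count line. */
    static void print_brick_line (int type, int bricks, int dist,
                                  int stripe, int replica)
    {
            if (type == TYPE_STRIPE_REPLICATE)
                    printf ("Number of Bricks: %d x %d x %d = %d\n",
                            bricks / dist, stripe, replica, bricks);
            else if (type == TYPE_NONE)
                    printf ("Number of Bricks: %d\n", bricks);
            else
                    /* for both replicate and stripe, dist is enough */
                    printf ("Number of Bricks: %d x %d = %d\n",
                            bricks / dist, dist, bricks);
    }

    int main (void)
    {
            /* 12 bricks, stripe 2, replica 3: each distribute subvolume has
             * 6 bricks, so 12/6 = 2 subvolumes -> "2 x 2 x 3 = 12". */
            print_brick_line (TYPE_STRIPE_REPLICATE, 12, 6, 2, 3);
            print_brick_line (TYPE_NONE, 4, 4, 1, 1);
            print_brick_line (TYPE_REPLICATE, 6, 3, 1, 3);
            return 0;
    }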
int
-gf_cli3_1_create_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_create_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_create_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
cli_local_t *local = NULL;
char *volname = NULL;
dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
if (-1 == req->rpc_status) {
goto out;
}
local = ((call_frame_t *) (myframe))->local;
- ((call_frame_t *) (myframe))->local = NULL;
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_create_vol_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- dict = local->u.create_vol.dict;
+ gf_log ("cli", GF_LOG_INFO, "Received resp to create volume");
+
+ dict = local->dict;
ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
- gf_log ("cli", GF_LOG_INFO, "Received resp to create volume");
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ if (rsp.op_ret == 0) {
+ rsp_dict = dict_new ();
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed rsp_dict unserialization");
+ goto out;
+ }
+ }
+
+ ret = cli_xml_output_vol_create (rsp_dict, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
+ cli_err ("volume create: %s: failed: %s", volname,
+ rsp.op_errstr);
+ else if (rsp.op_ret)
+ cli_err ("volume create: %s: failed", volname);
else
- cli_out ("Creation of volume %s has been %s", volname,
- (rsp.op_ret) ? "unsuccessful":
- "successful. Please start the volume to "
- "access data.");
+ cli_out ("volume create: %s: success: "
+ "please start the volume to access data", volname);
+
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
- if (dict)
- dict_unref (dict);
- if (local)
- cli_local_wipe (local);
- if (rsp.volname)
- free (rsp.volname);
- if (rsp.op_errstr)
- free (rsp.op_errstr);
+ free (rsp.dict.dict_val);
+ free (rsp.op_errstr);
return ret;
}
int
-gf_cli3_1_delete_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_delete_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_delete_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
cli_local_t *local = NULL;
char *volname = NULL;
call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_delete_vol_rsp);
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ local = frame->local;
+
+ if (local)
+ dict = local->dict;
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "dict get failed");
+ goto out;
+ }
+
+ gf_log ("cli", GF_LOG_INFO, "Received resp to delete volume");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ if (rsp.op_ret == 0) {
+ rsp_dict = dict_new ();
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed rsp_dict unserialization");
+ goto out;
+ }
+ }
+
+ ret = cli_xml_output_generic_volume ("volDelete", rsp_dict,
+ rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
goto out;
}
+ if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
+ cli_err ("volume delete: %s: failed: %s", volname,
+ rsp.op_errstr);
+ else if (rsp.op_ret)
+ cli_err ("volume delete: %s: failed", volname);
+ else
+ cli_out ("volume delete: %s: success", volname);
+
+ ret = rsp.op_ret;
+
+out:
+ cli_cmd_broadcast_response (ret);
+ free (rsp.dict.dict_val);
+
+ gf_log ("", GF_LOG_INFO, "Returning with %d", ret);
+ return ret;
+}
+
+int
+gf_cli3_1_uuid_get_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ char *uuid_str = NULL;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ cli_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+
+ if (-1 == req->rpc_status)
+ goto out;
+
frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
local = frame->local;
frame->local = NULL;
- if (local)
- volname = local->u.delete_vol.volname;
+ gf_log ("cli", GF_LOG_INFO, "Received resp to uuid get");
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
- gf_log ("cli", GF_LOG_INFO, "Received resp to delete volume");
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len,
+ &dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to unserialize "
+ "response for uuid get");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "uuid", &uuid_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get uuid "
+ "from dictionary");
+ goto out;
+ }
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_dict ("uuidGenerate", dict, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret) {
+ if (strcmp (rsp.op_errstr, "") == 0)
+ cli_err ("Get uuid was unsuccessful");
+ else
+ cli_err ("%s", rsp.op_errstr);
+
+ } else {
+ cli_out ("UUID: %s", uuid_str);
+
+ }
+ ret = rsp.op_ret;
+
+out:
+ cli_cmd_broadcast_response (ret);
+ cli_local_wipe (local);
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+ if (dict)
+ dict_unref (dict);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+int
+gf_cli3_1_uuid_reset_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ cli_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+
+ if (-1 == req->rpc_status) {
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ local = frame->local;
+ frame->local = NULL;
+
+ gf_log ("cli", GF_LOG_INFO, "Received resp to uuid reset");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_dict ("uuidReset", dict, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ cli_err ("%s", rsp.op_errstr);
else
- cli_out ("Deleting volume %s has been %s", volname,
+ cli_out ("resetting the peer uuid has been %s",
(rsp.op_ret) ? "unsuccessful": "successful");
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
cli_local_wipe (local);
- if (rsp.volname)
- free (rsp.volname);
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+ if (dict)
+ dict_unref (dict);
+
gf_log ("", GF_LOG_INFO, "Returning with %d", ret);
return ret;
}
int
-gf_cli3_1_start_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_start_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_start_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
cli_local_t *local = NULL;
char *volname = NULL;
call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_start_vol_rsp);
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
- if (frame) {
+ if (frame)
local = frame->local;
- frame->local = NULL;
- }
if (local)
- volname = local->u.start_vol.volname;
+ dict = local->dict;
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "dict get failed");
+ goto out;
+ }
gf_log ("cli", GF_LOG_INFO, "Received resp to start volume");
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ if (rsp.op_ret == 0) {
+ rsp_dict = dict_new ();
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed rsp_dict unserialization");
+ goto out;
+ }
+ }
+
+ ret = cli_xml_output_generic_volume ("volStart", rsp_dict,
+ rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ cli_err ("volume start: %s: failed: %s", volname,
+ rsp.op_errstr);
+ else if (rsp.op_ret)
+ cli_err ("volume start: %s: failed", volname);
else
- cli_out ("Starting volume %s has been %s", volname,
- (rsp.op_ret) ? "unsuccessful": "successful");
+ cli_out ("volume start: %s: success", volname);
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
- if (local)
- cli_local_wipe (local);
- if (rsp.volname)
- free (rsp.volname);
- if (rsp.op_errstr)
- free (rsp.op_errstr);
+ free (rsp.dict.dict_val);
+ free (rsp.op_errstr);
return ret;
}
int
-gf_cli3_1_stop_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_stop_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_stop_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
cli_local_t *local = NULL;
char *volname = NULL;
call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_stop_vol_rsp);
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
if (frame)
local = frame->local;
- if (local)
- volname = local->u.start_vol.volname;
+ if (local) {
+ dict = local->dict;
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Unable to get volname from dict");
+ goto out;
+ }
+ }
gf_log ("cli", GF_LOG_INFO, "Received resp to stop volume");
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ if (rsp.op_ret == 0) {
+ rsp_dict = dict_new ();
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed rsp_dict unserialization");
+ goto out;
+ }
+ }
+
+ ret = cli_xml_output_generic_volume ("volStop", rsp_dict,
+ rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ cli_err ("volume stop: %s: failed: %s", volname, rsp.op_errstr);
+ else if (rsp.op_ret)
+ cli_err ("volume stop: %s: failed", volname);
else
- cli_out ("Stopping volume %s has been %s", volname,
- (rsp.op_ret) ? "unsuccessful": "successful");
+ cli_out ("volume stop: %s: success", volname);
+
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
- if (rsp.op_errstr)
- free (rsp.op_errstr);
- if (rsp.volname)
- free (rsp.volname);
+ free (rsp.op_errstr);
+ free (rsp.dict.dict_val);
+
return ret;
}
int
-gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf2_cli_defrag_vol_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
cli_local_t *local = NULL;
char *volname = NULL;
call_frame_t *frame = NULL;
char *status = "unknown";
int cmd = 0;
- int ret = 0;
+ int ret = -1;
+ dict_t *dict = NULL;
+ dict_t *local_dict = NULL;
+ uint64_t files = 0;
+ uint64_t size = 0;
+ uint64_t lookup = 0;
+ char msg[1024] = {0,};
+ gf_defrag_status_t status_rcd = GF_DEFRAG_STATUS_NOT_STARTED;
+ int32_t counter = 0;
+ char *node_name = NULL;
+ char key[256] = {0,};
+ int32_t i = 1;
+ uint64_t failures = 0;
+ uint64_t skipped = 0;
+ double elapsed = 0;
+ char *size_str = NULL;
+ char *task_id_str = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp,
- (xdrproc_t)xdr_gf2_cli_defrag_vol_rsp);
+ (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
if (frame)
local = frame->local;
- if (local) {
- volname = local->u.defrag_vol.volname;
- cmd = local->u.defrag_vol.cmd;
+ if (local)
+ local_dict = local->dict;
+
+ ret = dict_get_str (local_dict, "volname", &volname);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to get volname");
+ goto out;
+ }
+
+ ret = dict_get_int32 (local_dict, "rebalance-command", (int32_t*)&cmd);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to get command");
+ goto out;
+ }
+
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "failed to "
+ "unserialize req-buffer to dictionary");
+ goto out;
+ }
+ }
+
+ if (!((cmd == GF_DEFRAG_CMD_STOP) || (cmd == GF_DEFRAG_CMD_STATUS)) &&
+ !(global_state->mode & GLUSTER_MODE_XML)) {
+ /* All other possibilities are about starting a rebalance */
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY, &task_id_str);
+ if (rsp.op_ret && strcmp (rsp.op_errstr, "")) {
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
+ } else {
+ if (!rsp.op_ret) {
+ snprintf (msg, sizeof (msg),
+ "Starting rebalance on volume %s has "
+ "been successful.\nID: %s", volname,
+ task_id_str);
+ } else {
+ snprintf (msg, sizeof (msg),
+ "Starting rebalance on volume %s has "
+ "been unsuccessful.", volname);
+ }
+ }
+ goto done;
}
if (cmd == GF_DEFRAG_CMD_STOP) {
if (rsp.op_ret == -1) {
if (strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg),
+ "%s", rsp.op_errstr);
else
- cli_out ("rebalance volume %s stop failed",
- volname);
+ snprintf (msg, sizeof (msg),
+ "rebalance volume %s stop failed",
+ volname);
+ goto done;
} else {
- cli_out ("stopped rebalance process of volume %s \n"
- "(after rebalancing %"PRId64" files totaling "
- "%"PRId64" bytes)", volname, rsp.files, rsp.size);
+ snprintf (msg, sizeof (msg),
+ "rebalance process may be in the middle of a "
+ "file migration.\nThe process will be fully "
+ "stopped once the migration of the file is "
+ "complete.\nPlease check rebalance process "
+ "for completion before doing any further "
+ "brick related tasks on the volume.");
}
- goto done;
}
if (cmd == GF_DEFRAG_CMD_STATUS) {
if (rsp.op_ret == -1) {
if (strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg),
+ "%s", rsp.op_errstr);
else
- cli_out ("failed to get the status of "
- "rebalance process");
+ snprintf (msg, sizeof (msg),
+ "Failed to get the status of "
+ "rebalance process");
goto done;
}
+ }
- switch (rsp.op_errno) {
- case GF_DEFRAG_STATUS_NOT_STARTED:
- status = "not started";
- break;
- case GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED:
- status = "step 1: layout fix in progress";
- break;
- case GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED:
- status = "step 2: data migration in progress";
- break;
- case GF_DEFRAG_STATUS_STOPPED:
- status = "stopped";
- break;
- case GF_DEFRAG_STATUS_COMPLETE:
- status = "completed";
- break;
- case GF_DEFRAG_STATUS_FAILED:
- status = "failed";
- break;
- case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE:
- status = "step 1: layout fix complete";
- break;
- case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE:
- status = "step 2: data migration complete";
- break;
- case GF_DEFRAG_STATUS_PAUSED:
- status = "paused";
- break;
- }
- if (rsp.files && (rsp.op_errno == 1)) {
- cli_out ("rebalance %s: fixed layout %"PRId64,
- status, rsp.files);
- goto done;
- }
- if (rsp.files && (rsp.op_errno == 6)) {
- cli_out ("rebalance %s: fixed layout %"PRId64,
- status, rsp.files);
- goto done;
- }
- if (rsp.files) {
- cli_out ("rebalance %s: rebalanced %"PRId64
- " files of size %"PRId64" (total files"
- " scanned %"PRId64")", status,
- rsp.files, rsp.size, rsp.lookedup_files);
- goto done;
- }
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_rebalance (cmd, dict, rsp.op_ret,
+ rsp.op_errno,
+ rsp.op_errstr);
+ goto out;
+ }
- cli_out ("rebalance %s", status);
- goto done;
+ ret = dict_get_int32 (dict, "count", &counter);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "count not set");
+ goto out;
}
- /* All other possibility is about starting a volume */
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
- else
- cli_out ("starting rebalance on volume %s has been %s",
- volname, (rsp.op_ret) ? "unsuccessful":
- "successful");
+ cli_out ("%40s %16s %13s %13s %13s %13s %20s %18s", "Node",
+ "Rebalanced-files", "size", "scanned", "failures", "skipped",
+ "status", "run time in secs");
+ cli_out ("%40s %16s %13s %13s %13s %13s %20s %18s", "---------",
+ "-----------", "-----------", "-----------", "-----------",
+ "-----------", "------------", "--------------");
+ do {
+ snprintf (key, 256, "node-name-%d", i);
+ ret = dict_get_str (dict, key, &node_name);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get node-name");
-done:
- if (volname)
- GF_FREE (volname);
+ memset (key, 0, 256);
+ snprintf (key, 256, "files-%d", i);
+ ret = dict_get_uint64 (dict, key, &files);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get file count");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "size-%d", i);
+ ret = dict_get_uint64 (dict, key, &size);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get size of xfer");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "lookups-%d", i);
+ ret = dict_get_uint64 (dict, key, &lookup);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get lookedup file count");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "status-%d", i);
+ ret = dict_get_int32 (dict, key, (int32_t *)&status_rcd);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get status");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "failures-%d", i);
+ ret = dict_get_uint64 (dict, key, &failures);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get failures count");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "skipped-%d", i);
+ ret = dict_get_uint64 (dict, key, &skipped);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get skipped count");
+ memset (key, 0, 256);
+ snprintf (key, 256, "run-time-%d", i);
+ ret = dict_get_double (dict, key, &elapsed);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get run-time");
+
+ status = cli_vol_task_status_str[status_rcd];
+ size_str = gf_uint64_2human_readable(size);
+ cli_out ("%40s %16"PRIu64 " %13s" " %13"PRIu64 " %13"PRIu64
+ " %13"PRIu64 " %20s %18.2f", node_name, files,
+ size_str, lookup, failures, skipped, status, elapsed);
+ GF_FREE(size_str);
+ i++;
+ } while (i <= counter);
+
+
+done:
+ if (global_state->mode & GLUSTER_MODE_XML)
+ cli_xml_output_str ("volRebalance", msg,
+ rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ else {
+ if (rsp.op_ret)
+ cli_err ("volume rebalance: %s: failed: %s", volname,
+ msg);
+ else
+ cli_out ("volume rebalance: %s: success: %s", volname,
+ msg);
+ }
ret = rsp.op_ret;
out:
- if (rsp.op_errstr)
- free (rsp.op_errstr); //malloced by xdr
- if (rsp.volname)
- free (rsp.volname); //malloced by xdr
+ free (rsp.op_errstr); //malloced by xdr
+ free (rsp.dict.dict_val); //malloced by xdr
+ if (dict)
+ dict_unref (dict);
cli_cmd_broadcast_response (ret);
return ret;
}
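[Editor's note] The status branch of the rebalance callback now renders one row per node from the unserialized response dictionary, using fixed printf field widths so the columns line up, plus a human-readable size string. The sketch below shows only the table-formatting part, with hard-coded sample values and a simple local byte formatter standing in for gf_uint64_2human_readable(), which is assumed to be provided elsewhere in libglusterfs.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    /* Local stand-in for a bytes-to-human-readable helper. */
    static void size_to_str (uint64_t bytes, char *buf, size_t len)
    {
            const char *unit[] = { "Bytes", "KB", "MB", "GB", "TB" };
            int i = 0;
            double val = (double) bytes;

            while (val >= 1024.0 && i < 4) {
                    val /= 1024.0;
                    i++;
            }
            snprintf (buf, len, "%.1f%s", val, unit[i]);
    }

    int main (void)
    {
            char size_buf[32] = {0};

            /* Header and separator mirror the fixed-width row format. */
            printf ("%40s %16s %13s %13s %20s %18s\n", "Node",
                    "Rebalanced-files", "size", "scanned",
                    "status", "run time in secs");
            printf ("%40s %16s %13s %13s %20s %18s\n", "---------",
                    "-----------", "-----------", "-----------",
                    "------------", "--------------");

            size_to_str ((uint64_t) 734003200, size_buf, sizeof (size_buf));

            /* One sample row; the real values come from the response dict. */
            printf ("%40s %16"PRIu64" %13s %13"PRIu64" %20s %18.2f\n",
                    "localhost", (uint64_t) 120, size_buf,
                    (uint64_t) 4096, "completed", 12.50);
            return 0;
    }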
int
-gf_cli3_1_rename_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_rename_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_rename_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_rename_vol_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to probe");
- cli_out ("Rename volume %s", (rsp.op_ret) ? "unsuccessful":
- "successful");
+ snprintf (msg, sizeof (msg), "Rename volume %s",
+ (rsp.op_ret) ? "unsuccessful": "successful");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str ("volRename", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret)
+ cli_err ("volume rename: failed");
+ else
+ cli_out ("volume rename: success");
ret = rsp.op_ret;
@@ -966,61 +1519,125 @@ out:
}
int
-gf_cli3_1_reset_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_reset_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_reset_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_reset_vol_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to reset");
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ if (strcmp (rsp.op_errstr, ""))
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
else
- cli_out ("reset volume %s", (rsp.op_ret) ? "unsuccessful":
- "successful");
+ snprintf (msg, sizeof (msg), "reset volume %s",
+ (rsp.op_ret) ? "unsuccessful": "successful");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str ("volReset", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret)
+ cli_err ("volume reset: failed: %s", msg);
+ else
+ cli_out ("volume reset: success: %s", msg);
ret = rsp.op_ret;
out:
- cli_cmd_broadcast_response (ret);
+ cli_cmd_broadcast_response (ret);
return ret;
}
+char *
+is_server_debug_xlator (void *myframe)
+{
+ call_frame_t *frame = NULL;
+ cli_local_t *local = NULL;
+ char **words = NULL;
+ char *key = NULL;
+ char *value = NULL;
+ char *debug_xlator = NULL;
+
+ frame = myframe;
+ local = frame->local;
+ words = (char **)local->words;
+
+ while (*words != NULL) {
+ if (strstr (*words, "trace") == NULL &&
+ strstr (*words, "error-gen") == NULL) {
+ words++;
+ continue;
+ }
+
+ key = *words;
+ words++;
+ value = *words;
+ if (value == NULL)
+ break;
+ if (strstr (value, "client")) {
+ words++;
+ continue;
+ } else {
+ if (!(strstr (value, "posix") || strstr (value, "acl")
+ || strstr (value, "locks") ||
+ strstr (value, "io-threads") ||
+ strstr (value, "marker") ||
+ strstr (value, "index"))) {
+ words++;
+ continue;
+ } else {
+ debug_xlator = gf_strdup (key);
+ break;
+ }
+ }
+ }
+
+ return debug_xlator;
+}
+
int
-gf_cli3_1_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_set_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
dict_t *dict = NULL;
char *help_str = NULL;
+ char msg[1024] = {0,};
+ char *debug_xlator = NULL;
+ char tmp_str[512] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_set_vol_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_INFO, "Received resp to set");
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
-
dict = dict_new ();
if (!dict) {
@@ -1030,36 +1647,73 @@ gf_cli3_1_set_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
- if (ret)
+ /* For brick processes a graph change does not happen on the fly.
+ * The process has to be restarted. So this is a check from the
+ * volume set option such that if debug xlators such as trace/errorgen
+ * are provided in the set command, warn the user.
+ */
+ debug_xlator = is_server_debug_xlator (myframe);
+
+ if (dict_get_str (dict, "help-str", &help_str) && !msg[0])
+ snprintf (msg, sizeof (msg), "Set volume %s",
+ (rsp.op_ret) ? "unsuccessful": "successful");
+ if (rsp.op_ret == 0 && debug_xlator) {
+ snprintf (tmp_str, sizeof (tmp_str), "\n%s translator has been "
+ "added to the server volume file. Please restart the"
+ " volume for enabling the translator", debug_xlator);
+ }
+
+ if ((global_state->mode & GLUSTER_MODE_XML) && (help_str == NULL)) {
+ ret = cli_xml_output_str ("volSet", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
goto out;
+ }
- if (dict_get_str (dict, "help-str", &help_str))
- cli_out ("Set volume %s", (rsp.op_ret) ? "unsuccessful":
- "successful");
- else
- cli_out ("%s", help_str);
+ if (rsp.op_ret) {
+ if (strcmp (rsp.op_errstr, ""))
+ cli_err ("volume set: failed: %s", rsp.op_errstr);
+ else
+ cli_err ("volume set: failed");
+ } else {
+ if (help_str == NULL) {
+ if (debug_xlator == NULL)
+ cli_out ("volume set: success");
+ else
+ cli_out ("volume set: success%s", tmp_str);
+ }else {
+ cli_out ("%s", help_str);
+ }
+ }
ret = rsp.op_ret;
out:
+ if (dict)
+ dict_unref (dict);
+ GF_FREE (debug_xlator);
cli_cmd_broadcast_response (ret);
return ret;
}
int
-gf_cli3_1_add_brick_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_add_brick_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_add_brick_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_add_brick_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -1067,18 +1721,30 @@ gf_cli3_1_add_brick_cbk (struct rpc_req *req, struct iovec *iov,
gf_log ("cli", GF_LOG_INFO, "Received resp to add brick");
if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
else
- cli_out ("Add Brick %s", (rsp.op_ret) ? "unsuccessful":
- "successful");
+ snprintf (msg, sizeof (msg), "Add Brick %s",
+ (rsp.op_ret) ? "unsuccessful": "successful");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str ("volAddBrick", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret)
+ cli_err ("volume add-brick: failed: %s", rsp.op_errstr);
+ else
+ cli_out ("volume add-brick: success");
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
- if (rsp.volname)
- free (rsp.volname);
- if (rsp.op_errstr)
- free (rsp.op_errstr);
+ free (rsp.dict.dict_val);
+ free (rsp.op_errstr);
return ret;
}
@@ -1086,128 +1752,348 @@ int
gf_cli3_remove_brick_status_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf2_cli_defrag_vol_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
char *status = "unknown";
- int ret = 0;
+ int ret = -1;
+ uint64_t files = 0;
+ uint64_t size = 0;
+ uint64_t lookup = 0;
+ dict_t *dict = NULL;
+ char msg[1024] = {0,};
+ char key[256] = {0,};
+ int32_t i = 1;
+ int32_t counter = 0;
+ char *node_name = 0;
+ gf_defrag_status_t status_rcd = GF_DEFRAG_STATUS_NOT_STARTED;
+ uint64_t failures = 0;
+ uint64_t skipped = 0;
+ double elapsed = 0;
+ char *size_str = NULL;
+ int32_t command = 0;
+ gf1_op_commands cmd = GF_OP_CMD_NONE;
+ cli_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ char *cmd_str = "unknown";
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp,
- (xdrproc_t)xdr_gf2_cli_defrag_vol_rsp);
+ (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- ret = rsp.op_ret;
- if (rsp.op_ret == -1) {
- if (strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
- else
- cli_out ("failed to get the status of "
- "remove-brick process");
+ if (frame)
+ local = frame->local;
+ ret = dict_get_int32 (local->dict, "command", &command);
+ if (ret)
goto out;
- }
+ cmd = command;
- switch (rsp.op_errno) {
- case GF_DEFRAG_STATUS_NOT_STARTED:
- status = "not started";
- break;
- case GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED:
- case GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED:
- case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE:
- status = "in progress";
+ switch (cmd) {
+ case GF_OP_CMD_STOP:
+ cmd_str = "stop";
break;
- case GF_DEFRAG_STATUS_STOPPED:
- status = "stopped";
+ case GF_OP_CMD_STATUS:
+ cmd_str = "status";
break;
- case GF_DEFRAG_STATUS_COMPLETE:
- case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE:
- status = "completed";
- break;
- case GF_DEFRAG_STATUS_FAILED:
- status = "failed";
- break;
- case GF_DEFRAG_STATUS_PAUSED:
- status = "paused";
+ default:
break;
}
- if (rsp.files && (rsp.op_errno == 1)) {
- cli_out ("remove-brick %s: fixed layout %"PRId64,
- status, rsp.files);
+ ret = rsp.op_ret;
+ if (rsp.op_ret == -1) {
+ if (strcmp (rsp.op_errstr, ""))
+ snprintf (msg, sizeof (msg), "volume remove-brick %s: "
+ "failed: %s", cmd_str, rsp.op_errstr);
+ else
+ snprintf (msg, sizeof (msg), "volume remove-brick %s: "
+ "failed", cmd_str);
+
+ if (global_state->mode & GLUSTER_MODE_XML)
+ goto xml_output;
+
+ cli_err ("%s", msg);
goto out;
}
- if (rsp.files && (rsp.op_errno == 6)) {
- cli_out ("remove-brick %s: fixed layout %"PRId64,
- status, rsp.files);
+
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ strncpy (msg, "failed to unserialize req-buffer to "
+ "dictionary", sizeof (msg));
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ rsp.op_ret = -1;
+ goto xml_output;
+ }
+
+ gf_log ("cli", GF_LOG_ERROR, "%s", msg);
+ goto out;
+ }
+ }
+
+xml_output:
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ if (strcmp (rsp.op_errstr, "")) {
+ ret = cli_xml_output_vol_remove_brick (_gf_true, dict,
+ rsp.op_ret,
+ rsp.op_errno,
+ rsp.op_errstr);
+ } else {
+ ret = cli_xml_output_vol_remove_brick (_gf_true, dict,
+ rsp.op_ret,
+ rsp.op_errno,
+ msg);
+ }
goto out;
}
- if (rsp.files) {
- cli_out ("remove-brick %s: decommissioned %"PRId64
- " files of size %"PRId64, status,
- rsp.files, rsp.size);
+
+ ret = dict_get_int32 (dict, "count", &counter);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "count not set");
goto out;
}
- cli_out ("remove-brick %s", status);
+
+ cli_out ("%40s %16s %13s %13s %13s %13s %14s %s", "Node",
+ "Rebalanced-files", "size", "scanned", "failures", "skipped",
+ "status", "run-time in secs");
+ cli_out ("%40s %16s %13s %13s %13s %13s %14s %16s", "---------",
+ "-----------", "-----------", "-----------", "-----------",
+ "-----------","------------", "--------------");
+
+ do {
+ snprintf (key, 256, "node-name-%d", i);
+ ret = dict_get_str (dict, key, &node_name);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get node-name");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "files-%d", i);
+ ret = dict_get_uint64 (dict, key, &files);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get file count");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "size-%d", i);
+ ret = dict_get_uint64 (dict, key, &size);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get size of xfer");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "lookups-%d", i);
+ ret = dict_get_uint64 (dict, key, &lookup);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get lookedup file count");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "status-%d", i);
+ ret = dict_get_int32 (dict, key, (int32_t *)&status_rcd);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get status");
+
+ snprintf (key, 256, "failures-%d", i);
+ ret = dict_get_uint64 (dict, key, &failures);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "Failed to get failure on files");
+
+                memset (key, 0, 256);
+                snprintf (key, 256, "skipped-%d", i);
+                ret = dict_get_uint64 (dict, key, &skipped);
+                if (ret)
+                        gf_log (frame->this->name, GF_LOG_TRACE,
+                                "Failed to get skipped files");
+ memset (key, 0, 256);
+ snprintf (key, 256, "run-time-%d", i);
+ ret = dict_get_double (dict, key, &elapsed);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "Failed to get run-time");
+
+ switch (status_rcd) {
+ case GF_DEFRAG_STATUS_NOT_STARTED:
+ status = "not started";
+ break;
+ case GF_DEFRAG_STATUS_STARTED:
+ status = "in progress";
+ break;
+ case GF_DEFRAG_STATUS_STOPPED:
+ status = "stopped";
+ break;
+ case GF_DEFRAG_STATUS_COMPLETE:
+ status = "completed";
+ break;
+ case GF_DEFRAG_STATUS_FAILED:
+ status = "failed";
+ break;
+ default:
+ break;
+ }
+
+ size_str = gf_uint64_2human_readable(size);
+
+ if (strcmp (status, "not started")) {
+ cli_out ("%40s %16"PRIu64 " %13s" " %13"PRIu64 " %13"
+ PRIu64 " %13"PRIu64 " %14s %16.2f", node_name,
+ files, size_str, lookup, failures, skipped,
+ status, elapsed);
+ }
+ GF_FREE(size_str);
+
+ i++;
+ } while (i <= counter);
+
+ if ((cmd == GF_OP_CMD_STOP) && (rsp.op_ret == 0)) {
+ cli_out ("'remove-brick' process may be in the middle of a "
+ "file migration.\nThe process will be fully stopped "
+ "once the migration of the file is complete.\nPlease "
+ "check remove-brick process for completion before "
+ "doing any further brick related tasks on the "
+ "volume.");
+ }
out:
- if (rsp.op_errstr)
- free (rsp.op_errstr); //malloced by xdr
- if (rsp.volname)
- free (rsp.volname); //malloced by xdr
+ free (rsp.dict.dict_val); //malloced by xdr
+ if (dict)
+ dict_unref (dict);
cli_cmd_broadcast_response (ret);
return ret;
}
int
-gf_cli3_1_remove_brick_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_remove_brick_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_remove_brick_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ char msg[1024] = {0,};
+ gf1_op_commands cmd = GF_OP_CMD_NONE;
+ char *cmd_str = "unknown";
+ cli_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ char *task_id_str = NULL;
+ dict_t *rsp_dict = NULL;
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_remove_brick_rsp);
+ frame = myframe;
+ local = frame->local;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
+ ret = dict_get_int32 (local->dict, "command", (int32_t *)&cmd);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "failed to get command");
+ goto out;
+ }
+
+ if (rsp.dict.dict_len) {
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to unserialize rsp_dict");
+ goto out;
+ }
+ }
+
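+        /* Map the remove-brick sub-command to a printable string; for
+         * "start", also pick up the task-id glusterd returned in rsp_dict. */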
+ switch (cmd) {
+ case GF_OP_CMD_START:
+ cmd_str = "start";
+
+ ret = dict_get_str (rsp_dict, GF_REMOVE_BRICK_TID_KEY, &task_id_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "remove-brick-id is not present in dict");
+ }
+ break;
+ case GF_OP_CMD_COMMIT:
+ cmd_str = "commit";
+ break;
+ case GF_OP_CMD_COMMIT_FORCE:
+ cmd_str = "commit force";
+ break;
+ default:
+ cmd_str = "unknown";
+ break;
+ }
+
gf_log ("cli", GF_LOG_INFO, "Received resp to remove brick");
if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
else
- cli_out ("Remove Brick %s", (rsp.op_ret) ? "unsuccessful":
- "successful");
+ snprintf (msg, sizeof (msg), "Remove Brick %s %s", cmd_str,
+ (rsp.op_ret) ? "unsuccessful": "successful");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_remove_brick (_gf_false, rsp_dict,
+ rsp.op_ret, rsp.op_errno,
+ msg);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret) {
+ cli_err ("volume remove-brick %s: failed: %s", cmd_str,
+ msg);
+ } else {
+ cli_out ("volume remove-brick %s: success", cmd_str);
+ if (GF_OP_CMD_START == cmd && task_id_str != NULL)
+ cli_out ("ID: %s", task_id_str);
+ }
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
- if (rsp.volname)
- free (rsp.volname);
- if (rsp.op_errstr)
- free (rsp.op_errstr);
+ free (rsp.dict.dict_val);
+ free (rsp.op_errstr);
+
return ret;
}
int
-gf_cli3_1_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_replace_brick_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
cli_local_t *local = NULL;
call_frame_t *frame = NULL;
dict_t *dict = NULL;
@@ -1216,6 +2102,9 @@ gf_cli3_1_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
char *status_reply = NULL;
gf1_cli_replace_op replace_op = 0;
char *rb_operation_str = NULL;
+ dict_t *rsp_dict = NULL;
+ char msg[1024] = {0,};
+ char *task_id_str = NULL;
if (-1 == req->rpc_status) {
goto out;
@@ -1223,15 +2112,16 @@ gf_cli3_1_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
frame = (call_frame_t *) myframe;
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_replace_brick_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
local = frame->local;
GF_ASSERT (local);
- dict = local->u.replace_brick.dict;
+ dict = local->dict;
ret = dict_get_int32 (dict, "operation", (int32_t *)&replace_op);
if (ret) {
@@ -1240,36 +2130,77 @@ gf_cli3_1_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ rsp_dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret < 0) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "failed to "
+ "unserialize rsp buffer to dictionary");
+ goto out;
+ }
+ }
+
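+        /* The operation strings below are gf_strdup()'d so the single
+         * GF_FREE (rb_operation_str) in the exit path covers every branch. */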
switch (replace_op) {
case GF_REPLACE_OP_START:
- if (rsp.op_ret)
- rb_operation_str = "replace-brick failed to start";
- else
- rb_operation_str = "replace-brick started successfully";
+ if (rsp.op_ret) {
+ rb_operation_str = gf_strdup ("replace-brick failed to"
+ " start");
+ } else {
+ ret = dict_get_str (rsp_dict, GF_REPLACE_BRICK_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get "
+ "\"replace-brick-id\" from dict");
+ goto out;
+ }
+ ret = gf_asprintf (&rb_operation_str,
+ "replace-brick started successfully"
+ "\nID: %s", task_id_str);
+ if (ret < 0)
+ goto out;
+ }
break;
case GF_REPLACE_OP_STATUS:
- status_reply = rsp.status;
- if (rsp.op_ret || ret)
- rb_operation_str = "replace-brick status unknown";
- else
- rb_operation_str = status_reply;
+ if (rsp.op_ret || ret) {
+ rb_operation_str = gf_strdup ("replace-brick status "
+ "unknown");
+ } else {
+ ret = dict_get_str (rsp_dict, "status-reply",
+ &status_reply);
+ if (ret) {
+                                gf_log (frame->this->name, GF_LOG_ERROR,
+                                        "failed to get status");
+ goto out;
+ }
+
+ rb_operation_str = gf_strdup (status_reply);
+ }
break;
case GF_REPLACE_OP_PAUSE:
if (rsp.op_ret)
- rb_operation_str = "replace-brick pause failed";
+ rb_operation_str = gf_strdup ("replace-brick pause "
+ "failed");
else
- rb_operation_str = "replace-brick paused successfully";
+ rb_operation_str = gf_strdup ("replace-brick paused "
+ "successfully");
break;
case GF_REPLACE_OP_ABORT:
if (rsp.op_ret)
- rb_operation_str = "replace-brick abort failed";
+ rb_operation_str = gf_strdup ("replace-brick abort "
+ "failed");
else
- rb_operation_str = "replace-brick aborted successfully";
+ rb_operation_str = gf_strdup ("replace-brick aborted "
+ "successfully");
break;
case GF_REPLACE_OP_COMMIT:
@@ -1290,9 +2221,11 @@ gf_cli3_1_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
if (rsp.op_ret || ret)
- rb_operation_str = "replace-brick commit failed";
+ rb_operation_str = gf_strdup ("replace-brick commit "
+ "failed");
else
- rb_operation_str = "replace-brick commit successful";
+ rb_operation_str = gf_strdup ("replace-brick commit "
+ "successful");
break;
@@ -1303,141 +2236,140 @@ gf_cli3_1_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,
}
if (rsp.op_ret && (strcmp (rsp.op_errstr, ""))) {
- rb_operation_str = rsp.op_errstr;
+ rb_operation_str = gf_strdup (rsp.op_errstr);
}
gf_log ("cli", GF_LOG_INFO, "Received resp to replace brick");
- cli_out ("%s",
- rb_operation_str ? rb_operation_str : "Unknown operation");
-
- ret = rsp.op_ret;
-
-out:
- if (local) {
- dict_unref (local->u.replace_brick.dict);
- GF_FREE (local->u.replace_brick.volname);
- cli_local_wipe (local);
- }
-
- cli_cmd_broadcast_response (ret);
- return ret;
-}
+ snprintf (msg, sizeof (msg), "%s",
+ rb_operation_str ? rb_operation_str : "Unknown operation");
-static int
-gf_cli3_1_log_filename_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
-{
- gf1_cli_log_filename_rsp rsp = {0,};
- int ret = -1;
-
- if (-1 == req->rpc_status) {
- goto out;
- }
-
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_log_filename_rsp);
- if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_replace_brick (replace_op, rsp_dict,
+ rsp.op_ret,
+ rsp.op_errno, msg);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
goto out;
}
- gf_log ("cli", GF_LOG_DEBUG, "Received resp to log filename");
-
- if (rsp.op_ret && strcmp (rsp.errstr, ""))
- cli_out ("%s", rsp.errstr);
+ if (rsp.op_ret)
+ cli_err ("volume replace-brick: failed: %s", msg);
else
- cli_out ("log filename : %s",
- (rsp.op_ret) ? "unsuccessful": "successful");
-
+ cli_out ("volume replace-brick: success: %s", msg);
ret = rsp.op_ret;
out:
- cli_cmd_broadcast_response (ret);
- return ret;
-}
-
-static int
-gf_cli3_1_log_locate_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
-{
- gf1_cli_log_locate_rsp rsp = {0,};
- int ret = -1;
-
- if (-1 == req->rpc_status) {
- goto out;
- }
+ if (frame)
+ frame->local = NULL;
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_log_locate_rsp);
- if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
- goto out;
+ if (local) {
+ dict_unref (local->dict);
+ cli_local_wipe (local);
}
- gf_log ("cli", GF_LOG_DEBUG, "Received resp to log locate");
- cli_out ("log file location: %s", rsp.path);
+ if (rb_operation_str)
+ GF_FREE (rb_operation_str);
- ret = rsp.op_ret;
-
-out:
cli_cmd_broadcast_response (ret);
+ free (rsp.dict.dict_val);
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
return ret;
}
+
static int
-gf_cli3_1_log_rotate_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_log_rotate_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_log_rotate_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_log_rotate_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_DEBUG, "Received resp to log rotate");
- if (rsp.op_ret && strcmp (rsp.errstr, ""))
- cli_out ("%s", rsp.errstr);
+ if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
else
- cli_out ("log rotate %s", (rsp.op_ret) ? "unsuccessful":
- "successful");
+ snprintf (msg, sizeof (msg), "log rotate %s",
+ (rsp.op_ret) ? "unsuccessful": "successful");
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str ("volLogRotate", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret)
+ cli_err ("volume log-rotate: failed: %s", msg);
+ else
+ cli_out ("volume log-rotate: success");
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
+ free (rsp.dict.dict_val);
+
return ret;
}
static int
-gf_cli3_1_sync_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_sync_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_sync_volume_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_sync_volume_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_DEBUG, "Received resp to sync");
if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg), "volume sync: failed: %s",
+ rsp.op_errstr);
else
- cli_out ("volume sync: %s",
- (rsp.op_ret) ? "unsuccessful": "successful");
+ snprintf (msg, sizeof (msg), "volume sync: %s",
+ (rsp.op_ret) ? "failed": "success");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str ("volSync", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret)
+ cli_err ("%s", msg);
+ else
+ cli_out ("%s", msg);
ret = rsp.op_ret;
out:
@@ -1446,12 +2378,12 @@ out:
}
int32_t
-gf_cli3_1_print_limit_list (char *volname, char *limit_list,
+gf_cli_print_limit_list (char *volname, char *limit_list,
char *op_errstr)
{
int64_t size = 0;
int64_t limit_value = 0;
- int32_t i, j, k;
+ int32_t i, j;
int32_t len = 0, ret = -1;
char *size_str = NULL;
char path [PATH_MAX] = {0, };
@@ -1459,6 +2391,7 @@ gf_cli3_1_print_limit_list (char *volname, char *limit_list,
char value [1024] = {0, };
char mountdir [] = "/tmp/mntXXXXXX";
char abspath [PATH_MAX] = {0, };
+ char *colon_ptr = NULL;
runner_t runner = {0,};
GF_VALIDATE_OR_GOTO ("cli", volname, out);
@@ -1469,7 +2402,7 @@ gf_cli3_1_print_limit_list (char *volname, char *limit_list,
len = strlen (limit_list);
if (len == 0) {
- cli_out ("%s", op_errstr?op_errstr:"quota limit not set ");
+ cli_err ("%s", op_errstr?op_errstr:"quota limit not set ");
goto out;
}
@@ -1495,7 +2428,7 @@ gf_cli3_1_print_limit_list (char *volname, char *limit_list,
len = strlen (limit_list);
if (len == 0) {
- cli_out ("quota limit not set ");
+ cli_err ("quota limit not set ");
goto unmount;
}
@@ -1506,19 +2439,16 @@ gf_cli3_1_print_limit_list (char *volname, char *limit_list,
"-----------------------");
while (i < len) {
j = 0;
- k = 0;
-
- while (limit_list [i] != ':') {
- path [k++] = limit_list [i++];
- }
- path [k] = '\0';
-
- i++; //skip ':'
while (limit_list [i] != ',' && limit_list [i] != '\0') {
- value [j++] = limit_list[i++];
+ path [j++] = limit_list[i++];
}
- value [j] = '\0';
+ path [j] = '\0';
+                /* here path[] contains both the path and its limit value */
+
+ colon_ptr = strrchr (path, ':');
+ *colon_ptr = '\0';
+ strcpy (value, ++colon_ptr);
snprintf (abspath, sizeof (abspath), "%s/%s", mountdir, path);
@@ -1561,64 +2491,155 @@ out:
}
int
-gf_cli3_1_quota_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_quota_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_quota_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ dict_t *dict = NULL;
+ char *volname = NULL;
+ char *limit_list = NULL;
+ int32_t type = 0;
+ char msg[1024] = {0,};
+ call_frame_t *frame = NULL;
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_quota_rsp);
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
if (rsp.op_ret &&
strcmp (rsp.op_errstr, "") == 0) {
- cli_out ("command unsuccessful %s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg), "command unsuccessful %s",
+ rsp.op_errstr);
+
+ if (global_state->mode & GLUSTER_MODE_XML)
+ goto xml_output;
goto out;
}
- if (rsp.type == GF_QUOTA_OPTION_TYPE_LIST) {
- if (rsp.limit_list) {
- gf_cli3_1_print_limit_list (rsp.volname,
- rsp.limit_list,
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "failed to "
+ "unserialize req-buffer to dictionary");
+ goto out;
+ }
+ }
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get volname");
+
+ ret = dict_get_str (dict, "limit_list", &limit_list);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get limit_list");
+
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret)
+ gf_log (frame->this->name, GF_LOG_TRACE,
+ "failed to get type");
+
+ if (type == GF_QUOTA_OPTION_TYPE_LIST) {
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_quota_limit_list
+ (volname, limit_list, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+
+ }
+
+ if (limit_list) {
+ gf_cli_print_limit_list (volname,
+ limit_list,
rsp.op_errstr);
+ } else {
+ gf_log ("cli", GF_LOG_INFO, "Received resp to quota "
+ "command ");
+ if (rsp.op_errstr)
+ snprintf (msg, sizeof (msg), "%s",
+ rsp.op_errstr);
}
} else {
gf_log ("cli", GF_LOG_INFO, "Received resp to quota command ");
if (rsp.op_errstr)
- cli_out ("%s", rsp.op_errstr);
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
else
- cli_out ("%s", "successful");
+ snprintf (msg, sizeof (msg), "successful");
}
-out:
- ret = rsp.op_ret;
+xml_output:
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str ("volQuota", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (strlen (msg) > 0) {
+ if (rsp.op_ret)
+ cli_err ("%s", msg);
+ else
+ cli_out ("%s", msg);
+ }
+ ret = rsp.op_ret;
+out:
cli_cmd_broadcast_response (ret);
+ if (dict)
+ dict_unref (dict);
+
+ free (rsp.dict.dict_val);
+
return ret;
}
int
-gf_cli3_1_getspec_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_getspec_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gf_getspec_rsp rsp = {0,};
- int ret = 0;
+ int ret = -1;
char *spec = NULL;
+ call_frame_t *frame = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_getspec_rsp);
- if (ret < 0 || rsp.op_ret == -1) {
- gf_log ("", GF_LOG_ERROR, "error");
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ if (rsp.op_ret == -1) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "getspec failed");
goto out;
}
@@ -1642,20 +2663,30 @@ out:
}
int
-gf_cli3_1_pmap_b2p_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_pmap_b2p_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
pmap_port_by_brick_rsp rsp = {0,};
- int ret = 0;
+ int ret = -1;
char *spec = NULL;
+ call_frame_t *frame = NULL;
if (-1 == req->rpc_status) {
goto out;
}
+ frame = myframe;
+
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_pmap_port_by_brick_rsp);
- if (ret < 0 || rsp.op_ret == -1) {
- gf_log ("", GF_LOG_ERROR, "error");
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ if (rsp.op_ret == -1) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+                        "pmap_b2p failed");
goto out;
}
@@ -1673,13 +2704,12 @@ out:
int32_t
-gf_cli3_1_probe (call_frame_t *frame, xlator_t *this,
+gf_cli_probe (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_probe_req req = {0,};
+ gf_cli_req req = {{0,},};
int ret = 0;
dict_t *dict = NULL;
- char *hostname = NULL;
int port = 0;
if (!frame || !this || !data) {
@@ -1688,36 +2718,34 @@ gf_cli3_1_probe (call_frame_t *frame, xlator_t *this,
}
dict = data;
- ret = dict_get_str (dict, "hostname", &hostname);
- if (ret)
- goto out;
ret = dict_get_int32 (dict, "port", &port);
- if (ret)
- port = CLI_GLUSTERD_PORT;
-
- req.hostname = hostname;
- req.port = port;
+ if (ret) {
+ ret = dict_set_int32 (dict, "port", CLI_GLUSTERD_PORT);
+ if (ret)
+ goto out;
+ }
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_PROBE, NULL,
- this, gf_cli3_1_probe_cbk,
- (xdrproc_t)xdr_gf1_cli_probe_req);
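+        /* cli_to_glusterd() serializes the dict into req.dict and submits
+         * the request to glusterd with the given callback. */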
+ ret = cli_to_glusterd (&req, frame, gf_cli_probe_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_PROBE, this, cli_rpc_prog, NULL);
out:
+ GF_FREE (req.dict.dict_val);
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
return ret;
}
int32_t
-gf_cli3_1_deprobe (call_frame_t *frame, xlator_t *this,
+gf_cli_deprobe (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_deprobe_req req = {0,};
+ gf_cli_req req = {{0,},};
int ret = 0;
dict_t *dict = NULL;
- char *hostname = NULL;
int port = 0;
+ int flags = 0;
if (!frame || !this || !data) {
ret = -1;
@@ -1725,53 +2753,70 @@ gf_cli3_1_deprobe (call_frame_t *frame, xlator_t *this,
}
dict = data;
- ret = dict_get_str (dict, "hostname", &hostname);
- if (ret)
- goto out;
-
ret = dict_get_int32 (dict, "port", &port);
- if (ret)
- port = CLI_GLUSTERD_PORT;
+ if (ret) {
+ ret = dict_set_int32 (dict, "port", CLI_GLUSTERD_PORT);
+ if (ret)
+ goto out;
+ }
- req.hostname = hostname;
- req.port = port;
+ ret = dict_get_int32 (dict, "flags", &flags);
+ if (ret) {
+ ret = dict_set_int32 (dict, "flags", 0);
+ if (ret)
+ goto out;
+ }
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_DEPROBE, NULL,
- this, gf_cli3_1_deprobe_cbk,
- (xdrproc_t)xdr_gf1_cli_deprobe_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_deprobe_cbk,
+ (xdrproc_t)xdr_gf_cli_req, dict,
+ GLUSTER_CLI_DEPROBE, this, cli_rpc_prog, NULL);
out:
+ GF_FREE (req.dict.dict_val);
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
return ret;
}
int32_t
-gf_cli3_1_list_friends (call_frame_t *frame, xlator_t *this,
- void *data)
+gf_cli_list_friends (call_frame_t *frame, xlator_t *this,
+ void *data)
{
gf1_cli_peer_list_req req = {0,};
int ret = 0;
+ unsigned long flags = 0;
if (!frame || !this) {
ret = -1;
goto out;
}
- req.flags = GF_CLI_LIST_ALL;
+ GF_ASSERT (frame->local == NULL);
+ flags = (long)data;
+ req.flags = flags;
+ frame->local = (void*)flags;
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_LIST_FRIENDS, NULL,
- this, gf_cli3_1_list_friends_cbk,
+ this, gf_cli_list_friends_cbk,
(xdrproc_t) xdr_gf1_cli_peer_list_req);
out:
+ if (ret) {
+ /*
+ * If everything goes fine, gf_cli_list_friends_cbk()
+                 * [invoked through cli_cmd_submit()] resets the
+ * frame->local to NULL. In case cli_cmd_submit()
+ * fails in between, RESET frame->local here.
+ */
+ frame->local = NULL;
+ }
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int32_t
-gf_cli3_1_get_next_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_get_next_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
@@ -1785,37 +2830,58 @@ gf_cli3_1_get_next_volume (call_frame_t *frame, xlator_t *this,
}
ctx = data;
+ local = frame->local;
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_info_begin (local, 0, 0, "");
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Error outputting to xml");
+ goto out;
+ }
+ }
- ret = gf_cli3_1_get_volume (frame, this, data);
+ ret = gf_cli_get_volume (frame, this, data);
- local = frame->local;
- if (!local || !local->u.get_vol.volname) {
- cli_out ("No volumes present");
+ if (!local || !local->get_vol.volname) {
+ if ((global_state->mode & GLUSTER_MODE_XML))
+ goto end_xml;
+
+ cli_err ("No volumes present");
goto out;
}
- ctx->volname = local->u.get_vol.volname;
+
+ ctx->volname = local->get_vol.volname;
while (ctx->volname) {
- ret = gf_cli3_1_get_volume (frame, this, ctx);
+ ret = gf_cli_get_volume (frame, this, ctx);
if (ret)
goto out;
- ctx->volname = local->u.get_vol.volname;
+ ctx->volname = local->get_vol.volname;
+ }
+
+end_xml:
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_info_end (local);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR, "Error outputting to xml");
}
out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int32_t
-gf_cli3_1_get_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_get_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_get_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
cli_cmd_volume_get_ctx_t *ctx = NULL;
dict_t *dict = NULL;
+ int32_t flags = 0;
if (!frame || !this || !data) {
ret = -1;
@@ -1823,7 +2889,6 @@ gf_cli3_1_get_volume (call_frame_t *frame, xlator_t *this,
}
ctx = data;
- req.flags = ctx->flags;
dict = dict_new ();
if (!dict)
@@ -1835,146 +2900,151 @@ gf_cli3_1_get_volume (call_frame_t *frame, xlator_t *this,
goto out;
}
- ret = dict_allocate_and_serialize (dict,
- &req.dict.dict_val,
- (size_t *)&req.dict.dict_len);
+ flags = ctx->flags;
+ ret = dict_set_int32 (dict, "flags", flags);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to set flags");
+ goto out;
+ }
+
+ ret = dict_allocate_and_serialize (dict, &req.dict.dict_val,
+ &req.dict.dict_len);
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_GET_VOLUME, NULL,
- this, gf_cli3_1_get_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_get_vol_req);
+ this, gf_cli_get_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req);
out:
+ if (dict)
+ dict_unref (dict);
+
+ GF_FREE (req.dict.dict_val);
+
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-
int32_t
-gf_cli3_1_create_volume (call_frame_t *frame, xlator_t *this,
- void *data)
+gf_cli3_1_uuid_get (call_frame_t *frame, xlator_t *this,
+ void *data)
{
- gf1_cli_create_vol_req req = {0,};
- int ret = 0;
- dict_t *dict = NULL;
- cli_local_t *local = NULL;
+ gf_cli_req req = {{0,}};
+ int ret = 0;
+ dict_t *dict = NULL;
- if (!frame || !this || !data) {
+ if (!frame || !this || !data) {
ret = -1;
goto out;
}
- dict = dict_ref ((dict_t *)data);
+ dict = data;
+ ret = cli_to_glusterd (&req, frame, gf_cli3_1_uuid_get_cbk,
+ (xdrproc_t)xdr_gf_cli_req, dict,
+ GLUSTER_CLI_UUID_GET, this, cli_rpc_prog,
+ NULL);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
- ret = dict_get_str (dict, "volname", &req.volname);
+int32_t
+gf_cli3_1_uuid_reset (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gf_cli_req req = {{0,}};
+ int ret = 0;
+ dict_t *dict = NULL;
- if (ret)
+ if (!frame || !this || !data) {
+ ret = -1;
goto out;
+ }
- ret = dict_get_int32 (dict, "type", (int32_t *)&req.type);
-
- if (ret)
- goto out;
+ dict = data;
+ ret = cli_to_glusterd (&req, frame, gf_cli3_1_uuid_reset_cbk,
+ (xdrproc_t)xdr_gf_cli_req, dict,
+ GLUSTER_CLI_UUID_RESET, this, cli_rpc_prog,
+ NULL);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
- ret = dict_get_int32 (dict, "count", &req.count);
- if (ret)
- goto out;
+int32_t
+gf_cli_create_volume (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gf_cli_req req = {{0,}};
+ int ret = 0;
+ dict_t *dict = NULL;
- ret = dict_allocate_and_serialize (dict,
- &req.bricks.bricks_val,
- (size_t *)&req.bricks.bricks_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to get serialized length of dict");
+ if (!frame || !this || !data) {
+ ret = -1;
goto out;
}
- local = cli_local_get ();
-
- if (local) {
- local->u.create_vol.dict = dict_ref (dict);
- frame->local = local;
- }
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_CREATE_VOLUME, NULL,
- this, gf_cli3_1_create_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_create_vol_req);
-
+ dict = data;
+ ret = cli_to_glusterd (&req, frame, gf_cli_create_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_CREATE_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
- if (dict)
- dict_unref (dict);
-
- if (req.bricks.bricks_val) {
- GF_FREE (req.bricks.bricks_val);
- }
+ GF_FREE (req.dict.dict_val);
return ret;
}
int32_t
-gf_cli3_1_delete_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_delete_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_delete_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
- cli_local_t *local = NULL;
+ dict_t *dict = NULL;
if (!frame || !this || !data) {
ret = -1;
goto out;
}
- local = cli_local_get ();
-
- if (local) {
- local->u.delete_vol.volname = data;
- frame->local = local;
- }
-
- req.volname = data;
+ dict = data;
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_DELETE_VOLUME, NULL,
- this, gf_cli3_1_delete_volume_cbk,
- (xdrproc_t)xdr_gf1_cli_delete_vol_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_delete_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_DELETE_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
+ GF_FREE (req.dict.dict_val);
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int32_t
-gf_cli3_1_start_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_start_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_start_vol_req *req = NULL;
+ gf_cli_req req = {{0,}};
int ret = 0;
- cli_local_t *local = NULL;
+ dict_t *dict = NULL;
if (!frame || !this || !data) {
ret = -1;
goto out;
}
- req = data;
- local = cli_local_get ();
-
- if (local) {
- local->u.start_vol.volname = req->volname;
- local->u.start_vol.flags = req->flags;
- frame->local = local;
- }
+ dict = data;
- ret = cli_cmd_submit (req, frame, cli_rpc_prog,
- GLUSTER_CLI_START_VOLUME, NULL,
- this, gf_cli3_1_start_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_start_vol_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_start_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_START_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -1983,31 +3053,24 @@ out:
}
int32_t
-gf_cli3_1_stop_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_stop_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_stop_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
- cli_local_t *local = NULL;
+ dict_t *dict = data;
if (!frame || !this || !data) {
ret = -1;
goto out;
}
- req = *((gf1_cli_stop_vol_req*)data);
- local = cli_local_get ();
-
- if (local) {
- local->u.stop_vol.volname = req.volname;
- local->u.stop_vol.flags = req.flags;
- frame->local = local;
- }
+ dict = data;
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_STOP_VOLUME, NULL,
- this, gf_cli3_1_stop_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_stop_vol_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_stop_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_STOP_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2016,14 +3079,11 @@ out:
}
int32_t
-gf_cli3_1_defrag_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_defrag_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_defrag_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
- cli_local_t *local = NULL;
- char *volname = NULL;
- char *cmd_str = NULL;
dict_t *dict = NULL;
if (!frame || !this || !data) {
@@ -2033,55 +3093,10 @@ gf_cli3_1_defrag_volume (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &volname);
- if (ret)
- gf_log ("", GF_LOG_DEBUG, "error");
-
- ret = dict_get_str (dict, "command", &cmd_str);
- if (ret) {
- gf_log ("", GF_LOG_DEBUG, "error");
- goto out;
- }
-
- if (strcmp (cmd_str, "start") == 0) {
- req.cmd = GF_DEFRAG_CMD_START;
- ret = dict_get_str (dict, "start-type", &cmd_str);
- if (!ret) {
- if (strcmp (cmd_str, "fix-layout") == 0) {
- req.cmd = GF_DEFRAG_CMD_START_LAYOUT_FIX;
- }
- if (strcmp (cmd_str, "migrate-data") == 0) {
- req.cmd = GF_DEFRAG_CMD_START_MIGRATE_DATA;
- }
- if (strcmp (cmd_str, "migrate-data-force") == 0) {
- req.cmd = GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE;
- }
- }
- goto done;
- }
- if (strcmp (cmd_str, "stop") == 0) {
- req.cmd = GF_DEFRAG_CMD_STOP;
- goto done;
- }
- if (strcmp (cmd_str, "status") == 0) {
- req.cmd = GF_DEFRAG_CMD_STATUS;
- }
-
-done:
- local = cli_local_get ();
-
- if (local) {
- local->u.defrag_vol.volname = gf_strdup (volname);
- local->u.defrag_vol.cmd = req.cmd;
- frame->local = local;
- }
-
- req.volname = volname;
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_DEFRAG_VOLUME, NULL,
- this, gf_cli3_1_defrag_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_defrag_vol_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_defrag_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_DEFRAG_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2090,10 +3105,10 @@ out:
}
int32_t
-gf_cli3_1_rename_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_rename_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_rename_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
dict_t *dict = NULL;
@@ -2104,20 +3119,20 @@ gf_cli3_1_rename_volume (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "old-volname", &req.old_volname);
+ ret = dict_allocate_and_serialize (dict, &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to serialize the data");
- if (ret)
goto out;
+ }
- ret = dict_get_str (dict, "new-volname", &req.new_volname);
-
- if (ret)
- goto out;
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_RENAME_VOLUME, NULL,
- this, gf_cli3_1_rename_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_rename_vol_req);
+ this, gf_cli_rename_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2126,10 +3141,10 @@ out:
}
int32_t
-gf_cli3_1_reset_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_reset_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_reset_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
dict_t *dict = NULL;
@@ -2140,36 +3155,21 @@ gf_cli3_1_reset_volume (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
-
- if (ret)
- goto out;
-
- ret = dict_allocate_and_serialize (dict,
- &req.dict.dict_val,
- (size_t *)&req.dict.dict_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to get serialized length of dict");
- goto out;
- }
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_RESET_VOLUME, NULL,
- this, gf_cli3_1_reset_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_reset_vol_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_reset_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_RESET_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
- gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
-
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int32_t
-gf_cli3_1_set_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_set_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_set_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
dict_t *dict = NULL;
@@ -2180,24 +3180,10 @@ gf_cli3_1_set_volume (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
-
- if (ret)
- goto out;
-
- ret = dict_allocate_and_serialize (dict,
- &req.dict.dict_val,
- (size_t *)&req.dict.dict_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to get serialized length of dict");
- goto out;
- }
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_SET_VOLUME, NULL,
- this, gf_cli3_1_set_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_set_vol_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_set_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_SET_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2206,12 +3192,14 @@ out:
}
int32_t
-gf_cli3_1_add_brick (call_frame_t *frame, xlator_t *this,
+gf_cli_add_brick (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_add_brick_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
dict_t *dict = NULL;
+ char *volname = NULL;
+ int32_t count = 0;
if (!frame || !this || !data) {
ret = -1;
@@ -2220,50 +3208,39 @@ gf_cli3_1_add_brick (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
+ ret = dict_get_str (dict, "volname", &volname);
if (ret)
goto out;
- ret = dict_get_int32 (dict, "count", &req.count);
+ ret = dict_get_int32 (dict, "count", &count);
if (ret)
goto out;
-
- ret = dict_allocate_and_serialize (dict,
- &req.bricks.bricks_val,
- (size_t *)&req.bricks.bricks_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to get serialized length of dict");
- goto out;
- }
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_ADD_BRICK, NULL,
- this, gf_cli3_1_add_brick_cbk,
- (xdrproc_t) xdr_gf1_cli_add_brick_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_add_brick_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_ADD_BRICK, this, cli_rpc_prog, NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
- if (req.bricks.bricks_val) {
- GF_FREE (req.bricks.bricks_val);
- }
+ GF_FREE (req.dict.dict_val);
return ret;
}
int32_t
-gf_cli3_1_remove_brick (call_frame_t *frame, xlator_t *this,
+gf_cli_remove_brick (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_remove_brick_req req = {0,};
- gf1_cli_defrag_vol_req status_req = {0,};
+        gf_cli_req                      req = {{0,}};
+        gf_cli_req                      status_req = {{0,}};
int ret = 0;
dict_t *dict = NULL;
int32_t command = 0;
char *volname = NULL;
+ dict_t *req_dict = NULL;
+ int32_t cmd = 0;
if (!frame || !this || !data) {
ret = -1;
@@ -2276,102 +3253,97 @@ gf_cli3_1_remove_brick (call_frame_t *frame, xlator_t *this,
if (ret)
goto out;
- ret = dict_get_int32 (dict, "count", &req.count);
- if (ret)
- goto out;
-
ret = dict_get_int32 (dict, "command", &command);
if (ret)
goto out;
- if (command != GF_OP_CMD_STATUS) {
- req.volname = volname;
+ if ((command != GF_OP_CMD_STATUS) &&
+ (command != GF_OP_CMD_STOP)) {
- ret = dict_allocate_and_serialize (dict,
- &req.bricks.bricks_val,
- (size_t *)&req.bricks.bricks_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to get serialized length of dict");
+
+ ret = cli_to_glusterd (&req, frame, gf_cli_remove_brick_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_REMOVE_BRICK, this,
+ cli_rpc_prog, NULL);
+ } else {
+ /* Need rebalance status to be sent :-) */
+ req_dict = dict_new ();
+ if (!req_dict) {
+ ret = -1;
goto out;
}
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_REMOVE_BRICK, NULL,
- this, gf_cli3_1_remove_brick_cbk,
- (xdrproc_t) xdr_gf1_cli_remove_brick_req);
- } else {
- /* Need rebalance status to e sent :-) */
- status_req.volname = volname;
- status_req.cmd = GF_DEFRAG_CMD_STATUS;
+ ret = dict_set_str (req_dict, "volname", volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set dict");
+ goto out;
+ }
- ret = cli_cmd_submit (&status_req, frame, cli_rpc_prog,
- GLUSTER_CLI_DEFRAG_VOLUME, NULL,
- this, gf_cli3_remove_brick_status_cbk,
- (xdrproc_t) xdr_gf1_cli_defrag_vol_req);
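+                /* status/stop of a remove-brick is serviced by the rebalance
+                 * framework, so translate it to the equivalent defrag command
+                 * and send it as a GLUSTER_CLI_DEFRAG_VOLUME request. */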
+ if (command == GF_OP_CMD_STATUS)
+ cmd |= GF_DEFRAG_CMD_STATUS;
+ else
+ cmd |= GF_DEFRAG_CMD_STOP;
- }
+ ret = dict_set_int32 (req_dict, "rebalance-command", (int32_t) cmd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set dict");
+ goto out;
+ }
+
+ ret = cli_to_glusterd (&status_req, frame,
+ gf_cli3_remove_brick_status_cbk,
+ (xdrproc_t) xdr_gf_cli_req, req_dict,
+ GLUSTER_CLI_DEFRAG_VOLUME, this,
+ cli_rpc_prog, NULL);
+
+ }
out:
+ if (req_dict)
+ dict_unref (req_dict);
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
- if (req.bricks.bricks_val) {
- GF_FREE (req.bricks.bricks_val);
- }
+ GF_FREE (req.dict.dict_val);
+
+ GF_FREE (status_req.dict.dict_val);
return ret;
}
int32_t
-gf_cli3_1_replace_brick (call_frame_t *frame, xlator_t *this,
+gf_cli_replace_brick (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_replace_brick_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
- cli_local_t *local = NULL;
dict_t *dict = NULL;
char *src_brick = NULL;
char *dst_brick = NULL;
+ char *volname = NULL;
+ int32_t op = 0;
if (!frame || !this || !data) {
ret = -1;
goto out;
}
- dict = data;
-
- local = cli_local_get ();
- if (!local) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto out;
- }
-
- local->u.replace_brick.dict = dict_ref (dict);
- frame->local = local;
+ dict = data;
- ret = dict_get_int32 (dict, "operation", (int32_t *)&req.op);
+ ret = dict_get_int32 (dict, "operation", &op);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"dict_get on operation failed");
goto out;
}
- ret = dict_get_str (dict, "volname", &req.volname);
+ ret = dict_get_str (dict, "volname", &volname);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"dict_get on volname failed");
goto out;
}
- local->u.replace_brick.volname = gf_strdup (req.volname);
- if (!local->u.replace_brick.volname) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
ret = dict_get_str (dict, "src-brick", &src_brick);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -2387,40 +3359,29 @@ gf_cli3_1_replace_brick (call_frame_t *frame, xlator_t *this,
}
gf_log (this->name, GF_LOG_DEBUG,
- "Recevied command replace-brick %s with "
+ "Received command replace-brick %s with "
"%s with operation=%d", src_brick,
- dst_brick, req.op);
-
-
- ret = dict_allocate_and_serialize (dict,
- &req.bricks.bricks_val,
- (size_t *)&req.bricks.bricks_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to get serialized length of dict");
- goto out;
- }
+ dst_brick, op);
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_REPLACE_BRICK, NULL,
- this, gf_cli3_1_replace_brick_cbk,
- (xdrproc_t) xdr_gf1_cli_replace_brick_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_replace_brick_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_REPLACE_BRICK, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
- if (req.bricks.bricks_val) {
- GF_FREE (req.bricks.bricks_val);
- }
+ GF_FREE (req.dict.dict_val);
return ret;
}
+
int32_t
-gf_cli3_1_log_filename (call_frame_t *frame, xlator_t *this,
- void *data)
+gf_cli_log_rotate (call_frame_t *frame, xlator_t *this,
+ void *data)
{
- gf1_cli_log_filename_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
dict_t *dict = NULL;
@@ -2431,67 +3392,25 @@ gf_cli3_1_log_filename (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
- if (ret)
- goto out;
-
- ret = dict_get_str (dict, "brick", &req.brick);
- if (ret)
- req.brick = "";
-
- ret = dict_get_str (dict, "path", &req.path);
- if (ret)
- goto out;
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_LOG_FILENAME, NULL,
- this, gf_cli3_1_log_filename_cbk,
- (xdrproc_t) xdr_gf1_cli_log_filename_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_log_rotate_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_LOG_ROTATE, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-
-static int
-gf_cli3_1_log_level_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
-{
- gf1_cli_log_level_rsp rsp = {0,};
- int ret = -1;
-
- if (req->rpc_status == -1)
- goto out;
-
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_log_level_rsp);
- if (ret < 0) {
- gf_log ("cli", GF_LOG_ERROR, "log level response error");
- goto out;
- }
-
- gf_log ("cli", GF_LOG_DEBUG, "Received response to log level cmd");
-
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
- else
- cli_out ("log level set: %s", (rsp.op_ret) ? "unsuccessful" :
- "successful");
-
- ret = rsp.op_ret;
-
- out:
- cli_cmd_broadcast_response (ret);
+ GF_FREE (req.dict.dict_val);
return ret;
}
int32_t
-gf_cli3_1_log_level (call_frame_t *frame, xlator_t *this,
- void *data)
+gf_cli_sync_volume (call_frame_t *frame, xlator_t *this,
+ void *data)
{
- gf1_cli_log_level_req req = {0,};
- int ret = 0;
- dict_t *dict = NULL;
+ int ret = 0;
+ gf_cli_req req = {{0,}};
+ dict_t *dict = NULL;
if (!frame || !this || !data) {
ret = -1;
@@ -2500,36 +3419,26 @@ gf_cli3_1_log_level (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
- if (ret)
- goto out;
-
- ret = dict_get_str (dict, "xlator", &req.xlator);
- if (ret)
- goto out;
-
- ret = dict_get_str (dict, "loglevel", &req.loglevel);
- if (ret)
- goto out;
+ ret = cli_to_glusterd (&req, frame, gf_cli_sync_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_SYNC_VOLUME, this, cli_rpc_prog,
+ NULL);
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_LOG_LEVEL, NULL,
- this, gf_cli3_1_log_level_cbk,
- (xdrproc_t) xdr_gf1_cli_log_level_req);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ GF_FREE (req.dict.dict_val);
- out:
- gf_log ("cli", GF_LOG_DEBUG, "Returning: %d", ret);
return ret;
}
-
int32_t
-gf_cli3_1_log_locate (call_frame_t *frame, xlator_t *this,
- void *data)
+gf_cli_getspec (call_frame_t *frame, xlator_t *this,
+ void *data)
{
- gf1_cli_log_locate_req req = {0,};
+ gf_getspec_req req = {0,};
int ret = 0;
- dict_t *dict = NULL;
+ dict_t *dict = NULL;
+ dict_t *op_dict = NULL;
if (!frame || !this || !data) {
ret = -1;
@@ -2538,117 +3447,59 @@ gf_cli3_1_log_locate (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
+ ret = dict_get_str (dict, "volid", &req.key);
if (ret)
goto out;
- ret = dict_get_str (dict, "brick", &req.brick);
- if (ret)
- req.brick = "";
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_LOG_LOCATE, NULL,
- this, gf_cli3_1_log_locate_cbk,
- (xdrproc_t) xdr_gf1_cli_log_locate_req);
-
-out:
- gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
-
- return ret;
-}
-
-int32_t
-gf_cli3_1_log_rotate (call_frame_t *frame, xlator_t *this,
- void *data)
-{
- gf1_cli_log_locate_req req = {0,};
- int ret = 0;
- dict_t *dict = NULL;
-
- if (!frame || !this || !data) {
+ op_dict = dict_new ();
+        if (!op_dict) {
ret = -1;
goto out;
}
- dict = data;
-
- ret = dict_get_str (dict, "volname", &req.volname);
- if (ret)
- goto out;
-
- ret = dict_get_str (dict, "brick", &req.brick);
- if (ret)
- req.brick = "";
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_LOG_ROTATE, NULL,
- this, gf_cli3_1_log_rotate_cbk,
- (xdrproc_t) xdr_gf1_cli_log_rotate_req);
-
-
-out:
- gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
-
- return ret;
-}
-
-int32_t
-gf_cli3_1_sync_volume (call_frame_t *frame, xlator_t *this,
- void *data)
-{
- int ret = 0;
-
- if (!frame || !this || !data) {
- ret = -1;
+ // Set the supported min and max op-versions, so glusterd can make a
+ // decision
+ ret = dict_set_int32 (op_dict, "min-op-version", GD_OP_VERSION_MIN);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set min-op-version"
+ " in request dict");
goto out;
}
- ret = cli_cmd_submit ((gf1_cli_sync_volume_req*)data, frame,
- cli_rpc_prog, GLUSTER_CLI_SYNC_VOLUME,
- NULL, this, gf_cli3_1_sync_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_sync_volume_req);
-
-out:
- gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
-
- return ret;
-}
-
-int32_t
-gf_cli3_1_getspec (call_frame_t *frame, xlator_t *this,
- void *data)
-{
- gf_getspec_req req = {0,};
- int ret = 0;
- dict_t *dict = NULL;
-
- if (!frame || !this || !data) {
- ret = -1;
+ ret = dict_set_int32 (op_dict, "max-op-version", GD_OP_VERSION_MAX);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set max-op-version"
+ " in request dict");
goto out;
}
- dict = data;
-
- ret = dict_get_str (dict, "volid", &req.key);
- if (ret)
+ ret = dict_allocate_and_serialize (op_dict, &req.xdata.xdata_val,
+ &req.xdata.xdata_len);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Failed to serialize dictionary");
goto out;
+ }
ret = cli_cmd_submit (&req, frame, &cli_handshake_prog,
GF_HNDSK_GETSPEC, NULL,
- this, gf_cli3_1_getspec_cbk,
+ this, gf_cli_getspec_cbk,
(xdrproc_t) xdr_gf_getspec_req);
out:
+ if (op_dict) {
+ dict_unref(op_dict);
+ }
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int32_t
-gf_cli3_1_quota (call_frame_t *frame, xlator_t *this,
+gf_cli_quota (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_quota_req req = {0,};
+ gf_cli_req req = {{0,}};
int ret = 0;
dict_t *dict = NULL;
@@ -2659,32 +3510,18 @@ gf_cli3_1_quota (call_frame_t *frame, xlator_t *this,
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
+ ret = cli_to_glusterd (&req, frame, gf_cli_quota_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_QUOTA, this, cli_rpc_prog, NULL);
- if (ret)
- goto out;
-
- ret = dict_allocate_and_serialize (dict,
- &req.dict.dict_val,
- (size_t *)&req.dict.dict_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to get serialized length of dict");
- goto out;
- }
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_QUOTA, NULL,
- this, gf_cli3_1_quota_cbk,
- (xdrproc_t) xdr_gf1_cli_quota_req);
-
- GF_FREE (req.dict.dict_val);
out:
+ GF_FREE (req.dict.dict_val);
+
return ret;
}
int32_t
-gf_cli3_1_pmap_b2p (call_frame_t *frame, xlator_t *this, void *data)
+gf_cli_pmap_b2p (call_frame_t *frame, xlator_t *this, void *data)
{
pmap_port_by_brick_req req = {0,};
int ret = 0;
@@ -2703,7 +3540,7 @@ gf_cli3_1_pmap_b2p (call_frame_t *frame, xlator_t *this, void *data)
ret = cli_cmd_submit (&req, frame, &cli_pmap_prog,
GF_PMAP_PORTBYBRICK, NULL,
- this, gf_cli3_1_pmap_b2p_cbk,
+ this, gf_cli_pmap_b2p_cbk,
(xdrproc_t) xdr_pmap_port_by_brick_req);
out:
@@ -2713,7 +3550,7 @@ out:
}
static int
-gf_cli3_1_fsm_log_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_fsm_log_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gf1_cli_fsm_log_rsp rsp = {0,};
@@ -2733,14 +3570,15 @@ gf_cli3_1_fsm_log_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_fsm_log_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
if (rsp.op_ret) {
if (strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
- cli_out ("fsm log unsuccessful");
+ cli_err ("%s", rsp.op_errstr);
+ cli_err ("fsm log unsuccessful");
ret = rsp.op_ret;
goto out;
}
@@ -2756,7 +3594,7 @@ gf_cli3_1_fsm_log_cbk (struct rpc_req *req, struct iovec *iov,
&dict);
if (ret) {
- cli_out ("bad response");
+ cli_err ("bad response");
goto out;
}
@@ -2764,7 +3602,7 @@ gf_cli3_1_fsm_log_cbk (struct rpc_req *req, struct iovec *iov,
if (tr_count)
cli_out("number of transitions: %d", tr_count);
else
- cli_out("No transitions");
+ cli_err("No transitions");
for (i = 0; i < tr_count; i++) {
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "log%d-old-state", i);
@@ -2803,7 +3641,7 @@ out:
}
int32_t
-gf_cli3_1_fsm_log (call_frame_t *frame, xlator_t *this, void *data)
+gf_cli_fsm_log (call_frame_t *frame, xlator_t *this, void *data)
{
int ret = -1;
gf1_cli_fsm_log_req req = {0,};
@@ -2817,7 +3655,7 @@ gf_cli3_1_fsm_log (call_frame_t *frame, xlator_t *this, void *data)
req.name = data;
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_FSM_LOG, NULL,
- this, gf_cli3_1_fsm_log_cbk,
+ this, gf_cli_fsm_log_cbk,
(xdrproc_t) xdr_gf1_cli_fsm_log_req);
out:
@@ -2827,14 +3665,17 @@ out:
}
int
-gf_cli3_1_gsync_config_command (dict_t *dict)
+gf_cli_gsync_config_command (dict_t *dict)
{
runner_t runner = {0,};
char *subop = NULL;
char *gwd = NULL;
char *slave = NULL;
+ char *confpath = NULL;
char *master = NULL;
char *op_name = NULL;
+ int ret = -1;
+ char conf_path[PATH_MAX] = "";
if (dict_get_str (dict, "subop", &subop) != 0)
return -1;
@@ -2853,9 +3694,17 @@ gf_cli3_1_gsync_config_command (dict_t *dict)
if (dict_get_str (dict, "op_name", &op_name) != 0)
op_name = NULL;
+ ret = dict_get_str (dict, "conf_path", &confpath);
+ if (!confpath) {
+ ret = snprintf (conf_path, sizeof(conf_path) - 1,
+ "%s/"GEOREP"/gsyncd_template.conf", gwd);
+ conf_path[ret] = '\0';
+ confpath = conf_path;
+ }
+
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gwd);
+ runner_argprintf (&runner, "%s", confpath);
if (master)
runner_argprintf (&runner, ":%s", master);
runner_add_arg (&runner, slave);
@@ -2867,82 +3716,667 @@ gf_cli3_1_gsync_config_command (dict_t *dict)
}
int
-gf_cli3_1_gsync_out_status (dict_t *dict)
+gf_cli_fetch_gsyncd_status_values (char *status,
+ gf_cli_gsync_status_t *sts_val)
{
- int gsync_count = 0;
- int i = 0;
- int ret = 0;
- char mst[PATH_MAX] = {0, };
- char slv[PATH_MAX]= {0, };
- char sts[PATH_MAX] = {0, };
- char hyphens[81] = {0, };
- char *mst_val = NULL;
- char *slv_val = NULL;
- char *sts_val = NULL;
-
- cli_out ("%-20s %-50s %-10s", "MASTER", "SLAVE", "STATUS");
-
- for (i=0; i<sizeof(hyphens)-1; i++)
- hyphens[i] = '-';
+ int32_t ret = -1;
+ char *tmp = NULL;
+ char *save_ptr = NULL;
+ char *key = NULL;
+ char *value = NULL;
- cli_out ("%s", hyphens);
+ if (!status || !sts_val) {
+ gf_log ("", GF_LOG_ERROR, "status or sts_val is null");
+ goto out;
+ }
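+        /* gsyncd reports health on the first line, followed by ';'-separated
+         * "Key=Value" pairs; any field left unset falls back to "N/A". */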
+ tmp = strtok_r (status, "\n", &save_ptr);
- ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
+ if (tmp)
+ sts_val->health = gf_strdup (tmp);
+
+ while (tmp) {
+ key = strtok_r (tmp, "=", &value);
+
+ if ((key) && (!strcmp(key, "Uptime")))
+ sts_val->uptime = gf_strdup (value);
+
+ if ((key) && (!strcmp(key, "FilesSyncd")))
+ sts_val->files_syncd = gf_strdup (value);
+
+ if ((key) && (!strcmp(key, "FilesPending")))
+ sts_val->files_pending = gf_strdup (value);
+
+ if ((key) && (!strcmp(key, "BytesPending"))) {
+ value = gf_uint64_2human_readable(atol(value));
+ sts_val->bytes_pending = gf_strdup (value);
+ }
+
+ if ((key) && (!strcmp(key, "DeletesPending")))
+ sts_val->deletes_pending = gf_strdup (value);
+
+ tmp = strtok_r (NULL, ";", &save_ptr);
+ }
+
+ if (sts_val->health)
+ ret = 0;
+
+ if (!sts_val->uptime)
+ sts_val->uptime = gf_strdup ("N/A");
+
+ if (!sts_val->files_syncd)
+ sts_val->files_syncd = gf_strdup ("N/A");
+
+ if (!sts_val->files_pending)
+ sts_val->files_pending = gf_strdup ("N/A");
+
+ if (!sts_val->bytes_pending)
+ sts_val->bytes_pending = gf_strdup ("N/A");
+
+ if (!sts_val->deletes_pending)
+ sts_val->deletes_pending = gf_strdup ("N/A");
+
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret);
+ return ret;
+}
+
+char*
+get_struct_variable (int mem_num, gf_cli_gsync_status_t *sts_val)
+{
+ switch (mem_num) {
+ case 0: return (sts_val->node);
+ case 1: return (sts_val->master);
+ case 2: return (sts_val->slave);
+ case 3: return (sts_val->health);
+ case 4: return (sts_val->uptime);
+ case 5: return (sts_val->files_syncd);
+ case 6: return (sts_val->files_pending);
+ case 7: return (sts_val->bytes_pending);
+ case 8: return (sts_val->deletes_pending);
+ default:
+ goto out;
+ }
+
+out:
+ return NULL;
+}
+
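+/* Render the geo-replication status table. Column widths come from the
+ * caller-computed "spacing" array; master/slave columns are blanked for
+ * "status detail", and the detail columns are blanked for plain "status". */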
+int
+gf_cli_print_status (char **title_values,
+ gf_cli_gsync_status_t **sts_vals,
+ int *spacing, int gsync_count,
+ int number_of_fields, int is_detail)
+{
+ int indents = 0;
+ int i = 0;
+ int j = 0;
+ int ret = 0;
+ int total_spacing = 0;
+ char **output_values = NULL;
+ char *tmp = NULL;
+ char *hyphens = NULL;
+ char heading[PATH_MAX] = {0, };
+ char indent_spaces[PATH_MAX] = {0, };
+
+ /* calculating spacing for hyphens */
+ for (i = 0; i < number_of_fields; i++) {
+ /* Suppressing master and slave output for status detail */
+ if ((is_detail) && ((i == 1) || (i == 2))) {
+ total_spacing++;
+ continue;
+ } else if ((!is_detail) && (i > 4)) {
+ /* Suppressing detailed output for
+ * status */
+ continue;
+ }
+ spacing[i] += 3; /* Adding extra space to
+ distinguish between fields */
+ total_spacing += spacing[i];
+ }
+ total_spacing += 4; /* For the spacing between the fields */
+
+ /* char pointers for each field */
+ output_values = GF_CALLOC (number_of_fields, sizeof (char *),
+ gf_common_mt_char);
+ if (!output_values) {
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < number_of_fields; i++) {
+ output_values[i] = GF_CALLOC (spacing[i] + 1, sizeof (char),
+ gf_common_mt_char);
+ if (!output_values[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ hyphens = GF_CALLOC (total_spacing + 1, sizeof (char),
+ gf_common_mt_char);
+ if (!hyphens) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf(heading, sizeof(heading), "MASTER: %s SLAVE: %s",
+ sts_vals[0]->master, sts_vals[0]->slave);
if (ret) {
- gf_log ("cli", GF_LOG_INFO, "No active geo-replication sessions"
- "present for the selected");
+ if (ret < sizeof(heading))
+ heading[ret] = '\0';
+ else
+ heading[sizeof(heading) - 1] = '\0';
ret = 0;
+ } else {
+ ret = -1;
goto out;
}
- for (i = 1; i <= gsync_count; i++) {
- snprintf (mst, sizeof(mst), "master%d", i);
- snprintf (slv, sizeof(slv), "slave%d", i);
- snprintf (sts, sizeof(sts), "status%d", i);
+ if (is_detail) {
+ cli_out (" ");
+ if (strlen(heading) > total_spacing)
+ cli_out ("%s", heading);
+ else {
+ /* Printing the heading with centre justification */
+ indents = (total_spacing - strlen(heading)) / 2;
+ memset (indent_spaces, ' ', indents);
+ indent_spaces[indents] = '\0';
+ ret = snprintf (hyphens, total_spacing, "%s%s",
+ indent_spaces, heading);
+ if (ret) {
+ hyphens[ret] = '\0';
+ cli_out ("%s", hyphens);
+ ret = 0;
+ } else {
+ ret = -1;
+ goto out;
+ }
+ }
+ cli_out (" ");
+ }
+
+        /* Set the column titles ("NODE", "MASTER", etc.) from title_values[]
+           and print the title row */
+ for (j = 0; j < number_of_fields; j++) {
+ /* Suppressing master and slave output for status detail */
+ if ((is_detail) && ((j == 1) || (j == 2))) {
+ output_values[j][0] = '\0';
+ continue;
+ } else if ((!is_detail) && (j > 4)) {
+ /* Suppressing detailed output for
+ * status */
+ output_values[j][0] = '\0';
+ continue;
+ }
+ memset (output_values[j], ' ', spacing[j]);
+ memcpy (output_values[j], title_values[j],
+ strlen(title_values[j]));
+ output_values[j][spacing[j]] = '\0';
+ }
+ cli_out ("%s %s %s %s %s %s %s %s %s", output_values[0],
+ output_values[1], output_values[2], output_values[3],
+ output_values[4], output_values[5], output_values[6],
+ output_values[7], output_values[8]);
+
+ /* setting and printing the hyphens */
+ memset (hyphens, '-', total_spacing);
+ hyphens[total_spacing] = '\0';
+ cli_out ("%s", hyphens);
+
+ for (i = 0; i < gsync_count; i++) {
+ for (j = 0; j < number_of_fields; j++) {
+ /* Suppressing master and slave output for
+ * status detail */
+ if ((is_detail) && ((j == 1) || (j == 2))) {
+ output_values[j][0] = '\0';
+ continue;
+ } else if ((!is_detail) && (j > 4)) {
+ /* Suppressing detailed output for
+ * status */
+ output_values[j][0] = '\0';
+ continue;
+ }
+ tmp = get_struct_variable(j, sts_vals[i]);
+ if (!tmp) {
+ gf_log ("", GF_LOG_ERROR,
+ "struct member empty.");
+ ret = -1;
+ goto out;
+ }
+ memset (output_values[j], ' ', spacing[j]);
+ memcpy (output_values[j], tmp, strlen (tmp));
+ output_values[j][spacing[j]] = '\0';
+ }
+
+ cli_out ("%s %s %s %s %s %s %s %s %s", output_values[0],
+ output_values[1], output_values[2], output_values[3],
+ output_values[4], output_values[5], output_values[6],
+ output_values[7], output_values[8]);
+ }
+
+out:
+ if (output_values) {
+ for (i = 0; i < number_of_fields; i++) {
+ if (output_values[i])
+ GF_FREE (output_values[i]);
+ }
+ GF_FREE (output_values);
+ }
- ret = dict_get_str (dict, mst, &mst_val);
+ if (hyphens)
+ GF_FREE (hyphens);
+
+ return ret;
+}
+
+int
+gf_cli_read_status_data (dict_t *dict,
+ gf_cli_gsync_status_t **sts_vals,
+ int *spacing, int gsync_count,
+ int number_of_fields)
+{
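+        /* Pulls the node/master/slave/status strings for every session out of
+         * the dict and records the widest value seen for each column in
+         * spacing[]. */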
+ int ret = 0;
+ int i = 0;
+ int j = 0;
+ char mst[PATH_MAX] = {0, };
+ char slv[PATH_MAX] = {0, };
+ char sts[PATH_MAX] = {0, };
+ char nds[PATH_MAX] = {0, };
+ char *status = NULL;
+ char *tmp = NULL;
+
+ /* Storing per node status info in each object */
+ for (i = 0; i < gsync_count; i++) {
+ snprintf (nds, sizeof(nds), "node%d", i + 1);
+ snprintf (mst, sizeof(mst), "master%d", i + 1);
+ snprintf (slv, sizeof(slv), "slave%d", i + 1);
+ snprintf (sts, sizeof(sts), "status%d", i + 1);
+
+ /* Fetching the values from dict, and calculating
+ the max length for each field */
+ ret = dict_get_str (dict, nds, &(sts_vals[i]->node));
if (ret)
goto out;
- ret = dict_get_str (dict, slv, &slv_val);
+ ret = dict_get_str (dict, mst, &(sts_vals[i]->master));
if (ret)
goto out;
- ret = dict_get_str (dict, sts, &sts_val);
+ ret = dict_get_str (dict, slv, &(sts_vals[i]->slave));
if (ret)
goto out;
- cli_out ("%-20s %-50s %-10s", mst_val,
- slv_val, sts_val);
+ ret = dict_get_str (dict, sts, &status);
+ if (ret)
+ goto out;
+
+                /* Parse health, uptime and the other counters from the
+                   status string into sts_vals[i] */
+ ret = gf_cli_fetch_gsyncd_status_values (status, sts_vals[i]);
+ if (ret)
+ goto out;
+ for (j = 0; j < number_of_fields; j++) {
+ tmp = get_struct_variable(j, sts_vals[i]);
+ if (!tmp) {
+ gf_log ("", GF_LOG_ERROR,
+ "struct member empty.");
+ ret = -1;
+ goto out;
+ }
+ if (strlen (tmp) > spacing[j])
+ spacing[j] = strlen (tmp);
+ }
}
- out:
+out:
return ret;
+}
+
+int
+gf_cli_gsync_status_output (dict_t *dict, int status_detail)
+{
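+        /* Entry point for "geo-replication ... status [detail]" output:
+         * allocates one gf_cli_gsync_status_t per reporting node, fills it
+         * from the dict and hands the array to gf_cli_print_status(). */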
+ int gsync_count = 0;
+ int i = 0;
+ int j = 0;
+ int ret = 0;
+ int spacing[10] = {0};
+ int num_of_fields = 9;
+ char errmsg[1024] = "";
+ char *master = NULL;
+ char *slave = NULL;
+ char *tmp = NULL;
+ char *title_values[] = {"NODE", "MASTER", "SLAVE",
+ "HEALTH", "UPTIME",
+ "FILES SYNCD",
+ "FILES PENDING",
+ "BYTES PENDING",
+ "DELETES PENDING"};
+ gf_cli_gsync_status_t **sts_vals = NULL;
+
+ /* Checks if any session is active or not */
+ ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
+ if (ret) {
+ ret = dict_get_str (dict, "master", &master);
+
+ ret = dict_get_str (dict, "slave", &slave);
+ if (master) {
+ if (slave)
+ snprintf (errmsg, sizeof(errmsg), "No active "
+ "geo-replication sessions between %s"
+ " and %s", master, slave);
+ else
+ snprintf (errmsg, sizeof(errmsg), "No active "
+ "geo-replication sessions for %s",
+ master);
+ } else
+ snprintf (errmsg, sizeof(errmsg), "No active "
+ "geo-replication sessions");
+
+ gf_log ("cli", GF_LOG_INFO, "%s", errmsg);
+ cli_out ("%s", errmsg);
+ ret = 0;
+ goto out;
+ }
+
+ for (i = 0; i < num_of_fields; i++)
+ spacing[i] = strlen(title_values[i]);
+
+        /* gsync_count = number of nodes reporting status;
+           each sts_vals[i] holds the output of one node */
+ sts_vals = GF_CALLOC (gsync_count, sizeof (gf_cli_gsync_status_t *),
+ gf_common_mt_char);
+ if (!sts_vals) {
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < gsync_count; i++) {
+ sts_vals[i] = GF_CALLOC (1, sizeof (gf_cli_gsync_status_t),
+ gf_common_mt_char);
+ if (!sts_vals[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = gf_cli_read_status_data (dict, sts_vals, spacing,
+ gsync_count, num_of_fields);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read status data");
+ goto out;
+ }
+
+ ret = gf_cli_print_status (title_values, sts_vals, spacing, gsync_count,
+ num_of_fields, status_detail);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to print status output");
+ goto out;
+ }
+
+out:
+ if (sts_vals) {
+ for (i = 0; i < gsync_count; i++) {
+ for (j = 3; j < num_of_fields; j++) {
+ tmp = get_struct_variable(j, sts_vals[i]);
+ if (tmp)
+ GF_FREE (tmp);
+ }
+ }
+ GF_FREE (sts_vals);
+ }
+
+ return ret;
+}
+
+static int32_t
+write_contents_to_common_pem_file (dict_t *dict, int output_count)
+{
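+        /* Concatenates the per-node "output_<i>" public keys from the dict
+         * into <glusterd_workdir>/geo-replication/common_secret.pem.pub,
+         * one key per line. */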
+ char *workdir = NULL;
+ char common_pem_file[PATH_MAX] = "";
+ char *output = NULL;
+ char output_name[PATH_MAX] = "";
+        int bytes_written = 0;
+ int fd = -1;
+ int ret = -1;
+ int i = -1;
+
+ ret = dict_get_str (dict, "glusterd_workdir", &workdir);
+ if (ret || !workdir) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch workdir");
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (common_pem_file, sizeof(common_pem_file),
+ "%s/geo-replication/common_secret.pem.pub",
+ workdir);
+
+ unlink (common_pem_file);
+
+ fd = open (common_pem_file, O_WRONLY | O_CREAT, 0600);
+ if (fd == -1) {
+ gf_log ("", GF_LOG_ERROR, "Failed to open %s"
+ " Error : %s", common_pem_file,
+ strerror (errno));
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 1; i <= output_count; i++) {
+ memset (output_name, '\0', sizeof (output_name));
+ snprintf (output_name, sizeof (output_name),
+ "output_%d", i);
+ ret = dict_get_str (dict, output_name, &output);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Failed to get %s.",
+ output_name);
+ cli_out ("Unable to fetch output.");
+ }
+ if (output) {
+                        bytes_written = write (fd, output, strlen(output));
+                        if (bytes_written != strlen(output)) {
+ gf_log ("", GF_LOG_ERROR, "Failed to write "
+ "to %s", common_pem_file);
+ ret = -1;
+ goto out;
+ }
+ /* Adding the new line character */
+                        bytes_written = write (fd, "\n", strlen("\n"));
+                        if (bytes_written != strlen("\n")) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to add new line char");
+ ret = -1;
+ goto out;
+ }
+ output = NULL;
+ }
+ }
+
+ cli_out ("Common secret pub file present at %s", common_pem_file);
+ ret = 0;
+out:
+        if (fd != -1)
+ close (fd);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
}
int
-gf_cli3_1_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_sys_exec_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
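+        /* Prints the result of a system-exec request: each "output_<i>"
+         * entry in the response dict is either written to the common pem
+         * file (for gsec_create) or echoed to the CLI. */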
+ int ret = -1;
+ int output_count = -1;
+ int i = -1;
+ char *output = NULL;
+ char *command = NULL;
+ char output_name[PATH_MAX] = "";
+ gf_cli_rsp rsp = {0, };
+ dict_t *dict = NULL;
+ call_frame_t *frame = NULL;
+
+ if (req->rpc_status == -1) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ dict = dict_new ();
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+
+ if (ret)
+ goto out;
+
+ if (rsp.op_ret) {
+ cli_err ("%s", rsp.op_errstr ? rsp.op_errstr :
+ "Command failed.");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "output_count", &output_count);
+ if (ret) {
+ cli_out ("Command executed successfully.");
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "command", &command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get command from dict");
+ goto out;
+ }
+
+ if (!strcmp (command, "gsec_create")) {
+ ret = write_contents_to_common_pem_file (dict, output_count);
+ if (!ret)
+ goto out;
+ }
+
+ for (i = 1; i <= output_count; i++) {
+ memset (output_name, '\0', sizeof (output_name));
+ snprintf (output_name, sizeof (output_name),
+ "output_%d", i);
+ ret = dict_get_str (dict, output_name, &output);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Failed to get %s.",
+ output_name);
+ cli_out ("Unable to fetch output.");
+ }
+ if (output) {
+ cli_out ("%s", output);
+ output = NULL;
+ }
+ }
+
+ ret = 0;
+out:
+ if (dict)
+ dict_unref (dict);
+ cli_cmd_broadcast_response (ret);
+
+ free (rsp.dict.dict_val);
+
+ return ret;
+}
+
+int
+gf_cli_copy_file_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ gf_cli_rsp rsp = {0, };
+ dict_t *dict = NULL;
+ call_frame_t *frame = NULL;
+
+ if (req->rpc_status == -1) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ dict = dict_new ();
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+
+ if (ret)
+ goto out;
+
+ if (rsp.op_ret) {
+ cli_err ("%s", rsp.op_errstr ? rsp.op_errstr :
+ "Copy unsuccessful");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ cli_out ("Successfully copied file.");
+
+out:
+ if (dict)
+ dict_unref (dict);
+ cli_cmd_broadcast_response (ret);
+
+ free (rsp.dict.dict_val);
+
+ return ret;
+}
+
+int
+gf_cli_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- int ret = 0;
- gf1_cli_gsync_set_rsp rsp = {0, };
+ int ret = -1;
+ gf_cli_rsp rsp = {0, };
dict_t *dict = NULL;
char *gsync_status = NULL;
char *master = NULL;
char *slave = NULL;
+ int32_t type = 0;
+ call_frame_t *frame = NULL;
+ gf_boolean_t status_detail = _gf_false;
+
if (req->rpc_status == -1) {
ret = -1;
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_gsync_set_rsp);
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR,
- "Unable to get response structure");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -2958,8 +4392,17 @@ gf_cli3_1_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
if (ret)
goto out;
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_gsync (dict, rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
if (rsp.op_ret) {
- cli_out ("%s", rsp.op_errstr ? rsp.op_errstr :
+ cli_err ("%s", rsp.op_errstr ? rsp.op_errstr :
GEOREP" command unsuccessful");
ret = rsp.op_ret;
goto out;
@@ -2971,8 +4414,13 @@ gf_cli3_1_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
else
ret = 0;
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to get type");
+ goto out;
+ }
- switch (rsp.type) {
+ switch (type) {
case GF_GSYNC_OPTION_TYPE_START:
case GF_GSYNC_OPTION_TYPE_STOP:
if (dict_get_str (dict, "master", &master) != 0)
@@ -2982,69 +4430,129 @@ gf_cli3_1_gsync_set_cbk (struct rpc_req *req, struct iovec *iov,
cli_out ("%s " GEOREP " session between %s & %s"
" has been successful",
- rsp.type == GF_GSYNC_OPTION_TYPE_START ?
+ type == GF_GSYNC_OPTION_TYPE_START ?
"Starting" : "Stopping",
master, slave);
break;
case GF_GSYNC_OPTION_TYPE_CONFIG:
- ret = gf_cli3_1_gsync_config_command (dict);
+ ret = gf_cli_gsync_config_command (dict);
break;
case GF_GSYNC_OPTION_TYPE_STATUS:
- ret = gf_cli3_1_gsync_out_status (dict);
- goto out;
+ status_detail = dict_get_str_boolean (dict,
+ "status-detail",
+ _gf_false);
+ ret = gf_cli_gsync_status_output (dict, status_detail);
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_DELETE:
+ if (dict_get_str (dict, "master", &master) != 0)
+ master = "???";
+ if (dict_get_str (dict, "slave", &slave) != 0)
+ slave = "???";
+ cli_out ("Deleting " GEOREP " session between %s & %s"
+ " has been successful", master, slave);
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_CREATE:
+ if (dict_get_str (dict, "master", &master) != 0)
+ master = "???";
+ if (dict_get_str (dict, "slave", &slave) != 0)
+ slave = "???";
+ cli_out ("Creating " GEOREP " session between %s & %s"
+ " has been successful", master, slave);
+ break;
+
default:
cli_out (GEOREP" command executed successfully");
}
out:
-
+ if (dict)
+ dict_unref (dict);
cli_cmd_broadcast_response (ret);
+ free (rsp.dict.dict_val);
+
return ret;
}
int32_t
-gf_cli3_1_gsync_set (call_frame_t *frame, xlator_t *this,
- void *data)
+gf_cli_sys_exec (call_frame_t *frame, xlator_t *this, void *data)
{
int ret = 0;
dict_t *dict = NULL;
- gf1_cli_gsync_set_req req;
+ gf_cli_req req = {{0,}};
if (!frame || !this || !data) {
ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid data");
goto out;
}
dict = data;
- ret = dict_allocate_and_serialize (dict,
- &req.dict.dict_val,
- (size_t *) &req.dict.dict_len);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to serialize the data");
+ ret = cli_to_glusterd (&req, frame, gf_cli_sys_exec_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_SYS_EXEC, this, cli_rpc_prog,
+ NULL);
+out:
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+int32_t
+gf_cli_copy_file (call_frame_t *frame, xlator_t *this, void *data)
+{
+ int ret = 0;
+ dict_t *dict = NULL;
+ gf_cli_req req = {{0,}};
+
+ if (!frame || !this || !data) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Invalid data");
goto out;
}
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_GSYNC_SET, NULL,
- this, gf_cli3_1_gsync_set_cbk,
- (xdrproc_t) xdr_gf1_cli_gsync_set_req);
+ dict = data;
+ ret = cli_to_glusterd (&req, frame, gf_cli_copy_file_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_COPY_FILE, this, cli_rpc_prog,
+ NULL);
out:
+ GF_FREE (req.dict.dict_val);
return ret;
}
-void*
-cli_profile_info_elem (void *a, int index)
+int32_t
+gf_cli_gsync_set (call_frame_t *frame, xlator_t *this,
+ void *data)
{
- return ((cli_profile_info_t *)a) + index;
+ int ret = 0;
+ dict_t *dict = NULL;
+ gf_cli_req req = {{0,}};
+
+ if (!frame || !this || !data) {
+ ret = -1;
+ goto out;
+ }
+
+ dict = data;
+
+ ret = cli_to_glusterd (&req, frame, gf_cli_gsync_set_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_GSYNC_SET, this, cli_rpc_prog,
+ NULL);
+
+out:
+ GF_FREE (req.dict.dict_val);
+
+ return ret;
}
+
int
cli_profile_info_percentage_cmp (void *a, void *b)
{
@@ -3063,19 +4571,6 @@ cli_profile_info_percentage_cmp (void *a, void *b)
return ret;
}
-void
-cli_profile_info_swap (void *a, void *b)
-{
- cli_profile_info_t *ia = NULL;
- cli_profile_info_t *ib = NULL;
- cli_profile_info_t tmp = {0};
-
- ia = a;
- ib = b;
- tmp = *ia;
- *ia = *ib;
- *ib = tmp;
-}
void
cmd_profile_volume_brick_out (dict_t *dict, int count, int interval)
@@ -3085,7 +4580,6 @@ cmd_profile_volume_brick_out (dict_t *dict, int count, int interval)
uint64_t sec = 0;
uint64_t r_count = 0;
uint64_t w_count = 0;
- char *brick = NULL;
uint64_t rb_counts[32] = {0};
uint64_t wb_counts[32] = {0};
cli_profile_info_t profile_info[GF_FOP_MAXVALUE] = {{0}};
@@ -3098,9 +4592,6 @@ cmd_profile_volume_brick_out (dict_t *dict, int count, int interval)
int ret = 0;
double total_percentage_latency = 0;
- memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "%d-brick", count);
- ret = dict_get_str (dict, key, &brick);
for (i = 0; i < 32; i++) {
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%d-%d-read-%d", count,
@@ -3135,7 +4626,7 @@ cmd_profile_volume_brick_out (dict_t *dict, int count, int interval)
snprintf (key, sizeof (key), "%d-%d-%d-maxlatency", count,
interval, i);
ret = dict_get_double (dict, key, &profile_info[i].max_latency);
- profile_info[i].fop_name = gf_fop_list[i];
+ profile_info[i].fop_name = (char *)gf_fop_list[i];
total_percentage_latency +=
(profile_info[i].fop_hits * profile_info[i].avg_latency);
@@ -3163,7 +4654,6 @@ cmd_profile_volume_brick_out (dict_t *dict, int count, int interval)
ret = dict_get_uint64 (dict, key, &w_count);
if (ret == 0) {
- cli_out ("Brick: %s", brick);
}
if (interval == -1)
@@ -3248,26 +4738,30 @@ cmd_profile_volume_brick_out (dict_t *dict, int count, int interval)
}
int32_t
-gf_cli3_1_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_stats_volume_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
int ret = -1;
dict_t *dict = NULL;
- gf1_cli_stats_op op = GF_CLI_STATS_NONE;
+ gf1_cli_stats_op op = GF_CLI_STATS_NONE;
char key[256] = {0};
int interval = 0;
int i = 1;
int32_t brick_count = 0;
char *volname = NULL;
+ char *brick = NULL;
+ char str[1024] = {0,};
+
if (-1 == req->rpc_status) {
goto out;
}
gf_log ("cli", GF_LOG_DEBUG, "Received resp to profile");
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_stats_volume_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -3278,8 +4772,8 @@ gf_cli3_1_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- ret = dict_unserialize (rsp.stats_info.stats_info_val,
- rsp.stats_info.stats_info_len,
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
&dict);
if (ret) {
@@ -3287,7 +4781,17 @@ gf_cli3_1_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
"Unable to allocate memory");
goto out;
} else {
- dict->extra_stdfree = rsp.stats_info.stats_info_val;
+ dict->extra_stdfree = rsp.dict.dict_val;
+ }
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_profile (dict, rsp.op_ret,
+ rsp.op_errno,
+ rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
}
ret = dict_get_str (dict, "volname", &volname);
@@ -3299,7 +4803,7 @@ gf_cli3_1_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
if (rsp.op_ret && strcmp (rsp.op_errstr, "")) {
- cli_out ("%s", rsp.op_errstr);
+ cli_err ("%s", rsp.op_errstr);
} else {
switch (op) {
case GF_CLI_STATS_START:
@@ -3337,11 +4841,28 @@ gf_cli3_1_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
if (!brick_count) {
- cli_out ("All bricks of volume %s are down.", volname);
+ cli_err ("All bricks of volume %s are down.", volname);
goto out;
}
while (i <= brick_count) {
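+                /* Print a "Brick:" or "NFS Server :" heading, underlined with
+                 * hyphens, before the per-interval profile blocks. */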
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-brick", i);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Couldn't get brick name");
+ goto out;
+ }
+
+ ret = dict_get_str_boolean (dict, "nfs", _gf_false);
+ if (ret)
+ snprintf (str, sizeof (str), "NFS Server : %s", brick);
+ else
+ snprintf (str, sizeof (str), "Brick: %s", brick);
+ cli_out ("%s", str);
+ memset (str, '-', strlen (str));
+ cli_out ("%s", str);
+
snprintf (key, sizeof (key), "%d-cumulative", i);
ret = dict_get_int32 (dict, key, &interval);
if (ret == 0) {
@@ -3359,18 +4880,17 @@ gf_cli3_1_profile_volume_cbk (struct rpc_req *req, struct iovec *iov,
out:
if (dict)
dict_unref (dict);
- if (rsp.op_errstr)
- free (rsp.op_errstr);
+ free (rsp.op_errstr);
cli_cmd_broadcast_response (ret);
return ret;
}
int32_t
-gf_cli3_1_profile_volume (call_frame_t *frame, xlator_t *this, void *data)
+gf_cli_profile_volume (call_frame_t *frame, xlator_t *this, void *data)
{
int ret = -1;
- gf1_cli_stats_volume_req req = {0,};
- dict_t *dict = NULL;
+ gf_cli_req req = {{0,}};
+ dict_t *dict = NULL;
GF_ASSERT (frame);
GF_ASSERT (this);
@@ -3379,67 +4899,65 @@ gf_cli3_1_profile_volume (call_frame_t *frame, xlator_t *this, void *data)
if (!frame || !this || !data)
goto out;
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
- if (ret)
- goto out;
- ret = dict_get_int32 (dict, "op", (int32_t*)&req.op);
- if (ret)
- goto out;
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_PROFILE_VOLUME, NULL,
- this, gf_cli3_1_profile_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_stats_volume_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_profile_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_PROFILE_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
+ GF_FREE (req.dict.dict_val);
return ret;
}
int32_t
-gf_cli3_1_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_stats_volume_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
int ret = -1;
- dict_t *dict = NULL;
- gf1_cli_stats_op op = GF_CLI_STATS_NONE;
+ dict_t *dict = NULL;
+ gf1_cli_stats_op op = GF_CLI_STATS_NONE;
char key[256] = {0};
int i = 0;
int32_t brick_count = 0;
char brick[1024];
int32_t members = 0;
- char *filename;
- char *bricks;
- uint64_t value = 0;
+ char *filename;
+ char *bricks;
+ uint64_t value = 0;
int32_t j = 0;
gf1_cli_top_op top_op = GF_CLI_TOP_NONE;
uint64_t nr_open = 0;
uint64_t max_nr_open = 0;
double throughput = 0;
double time = 0;
- long int time_sec = 0;
- long int time_usec = 0;
- struct tm *tm = NULL;
+ int32_t time_sec = 0;
+ long int time_usec = 0;
char timestr[256] = {0, };
- char *openfd_str = NULL;
+ char *openfd_str = NULL;
+ gf_boolean_t nfs = _gf_false;
+ gf_boolean_t clear_stats = _gf_false;
+ int stats_cleared = 0;
if (-1 == req->rpc_status) {
goto out;
}
gf_log ("cli", GF_LOG_DEBUG, "Received resp to top");
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_stats_volume_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "Unable to decode response");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
if (rsp.op_ret) {
if (strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
- cli_out ("volume top unsuccessful");
+ cli_err ("%s", rsp.op_errstr);
+ cli_err ("volume top unsuccessful");
ret = rsp.op_ret;
goto out;
}
@@ -3451,8 +4969,8 @@ gf_cli3_1_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
- ret = dict_unserialize (rsp.stats_info.stats_info_val,
- rsp.stats_info.stats_info_len,
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
&dict);
if (ret) {
@@ -3467,6 +4985,18 @@ gf_cli3_1_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = 0;
goto out;
}
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_top (dict, rsp.op_ret,
+ rsp.op_errno,
+ rsp.op_errstr);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ }
+ goto out;
+ }
+
ret = dict_get_int32 (dict, "count", &brick_count);
if (ret)
goto out;
@@ -3474,13 +5004,35 @@ gf_cli3_1_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_int32 (dict, key, (int32_t*)&top_op);
if (ret)
goto out;
+
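+        /* A "top ... clear" request only reports, per brick, whether the
+         * stats were reset; the usual top output is skipped. */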
+ clear_stats = dict_get_str_boolean (dict, "clear-stats", _gf_false);
+
while (i < brick_count) {
i++;
snprintf (brick, sizeof (brick), "%d-brick", i);
ret = dict_get_str (dict, brick, &bricks);
if (ret)
goto out;
- cli_out ("Brick: %s", bricks);
+
+ nfs = dict_get_str_boolean (dict, "nfs", _gf_false);
+
+ if (clear_stats) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-stats-cleared", i);
+ ret = dict_get_int32 (dict, key, &stats_cleared);
+ if (ret)
+ goto out;
+ cli_out (stats_cleared ? "Cleared stats for %s %s" :
+ "Failed to clear stats for %s %s",
+ nfs ? "NFS server on" : "brick", bricks);
+ continue;
+ }
+
+ if (nfs)
+ cli_out ("NFS Server : %s", bricks);
+ else
+ cli_out ("Brick: %s", bricks);
+
snprintf(key, sizeof (key), "%d-members", i);
ret = dict_get_int32 (dict, key, &members);
@@ -3525,7 +5077,14 @@ gf_cli3_1_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
if (!members) {
continue;
}
- cli_out ("MBps\t\tfilename\t\t time\n========================");
+ cli_out ("%*s %-*s %-*s",
+ VOL_TOP_PERF_SPEED_WIDTH, "MBps",
+ VOL_TOP_PERF_FILENAME_DEF_WIDTH, "Filename",
+ VOL_TOP_PERF_TIME_WIDTH, "Time");
+ cli_out ("%*s %-*s %-*s",
+ VOL_TOP_PERF_SPEED_WIDTH, "====",
+ VOL_TOP_PERF_FILENAME_DEF_WIDTH, "========",
+ VOL_TOP_PERF_TIME_WIDTH, "====");
break;
default:
goto out;
@@ -3550,14 +5109,27 @@ gf_cli3_1_top_volume_cbk (struct rpc_req *req, struct iovec *iov,
ret = dict_get_int32 (dict, key, (int32_t *)&time_usec);
if (ret)
goto out;
- tm = localtime (&time_sec);
- if (!tm)
- goto out;
- strftime (timestr, 256, "%Y-%m-%d %H:%M:%S", tm);
- snprintf (timestr + strlen (timestr), 256 - strlen (timestr),
+ gf_time_fmt (timestr, sizeof timestr,
+ time_sec, gf_timefmt_FT);
+ snprintf (timestr + strlen (timestr), sizeof timestr - strlen (timestr),
".%"GF_PRI_SUSECONDS, time_usec);
-
- cli_out ("%"PRIu64"\t\t%s\t\t%s", value, filename, timestr);
+ if (strlen (filename) < VOL_TOP_PERF_FILENAME_DEF_WIDTH)
+ cli_out ("%*"PRIu64" %-*s %-*s",
+ VOL_TOP_PERF_SPEED_WIDTH,
+ value,
+ VOL_TOP_PERF_FILENAME_DEF_WIDTH,
+ filename,
+ VOL_TOP_PERF_TIME_WIDTH,
+ timestr);
+ else
+ cli_out ("%*"PRIu64" ...%-*s %-*s",
+ VOL_TOP_PERF_SPEED_WIDTH,
+ value,
+                                 VOL_TOP_PERF_FILENAME_ALT_WIDTH,
+ filename + strlen (filename) -
+ VOL_TOP_PERF_FILENAME_ALT_WIDTH,
+ VOL_TOP_PERF_TIME_WIDTH,
+ timestr);
} else {
cli_out ("%"PRIu64"\t\t%s", value, filename);
}
@@ -3571,16 +5143,15 @@ out:
if (dict)
dict_unref (dict);
- if (rsp.stats_info.stats_info_val)
- free (rsp.stats_info.stats_info_val);
+ free (rsp.dict.dict_val);
return ret;
}
int32_t
-gf_cli3_1_top_volume (call_frame_t *frame, xlator_t *this, void *data)
+gf_cli_top_volume (call_frame_t *frame, xlator_t *this, void *data)
{
int ret = -1;
- gf1_cli_stats_volume_req req = {0,};
+ gf_cli_req req = {{0,}};
dict_t *dict = NULL;
GF_ASSERT (frame);
@@ -3590,43 +5161,40 @@ gf_cli3_1_top_volume (call_frame_t *frame, xlator_t *this, void *data)
if (!frame || !this || !data)
goto out;
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
- if (ret)
- goto out;
- ret = dict_get_int32 (dict, "op", (int32_t*)&req.op);
- if (ret)
- goto out;
-
- ret = dict_allocate_and_serialize (dict,
- &req.dict_req.dict_req_val,
- (size_t *)&req.dict_req.dict_req_len);
-
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_PROFILE_VOLUME, NULL,
- this, gf_cli3_1_top_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_stats_volume_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_top_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_PROFILE_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ GF_FREE (req.dict.dict_val);
return ret;
}
int
-gf_cli3_1_getwd_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_getwd_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gf1_cli_getwd_rsp rsp = {0,};
- int ret = 0;
+ int ret = -1;
if (-1 == req->rpc_status) {
goto out;
}
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_getwd_rsp);
- if (ret < 0 || rsp.op_ret == -1) {
- gf_log ("", GF_LOG_ERROR, "error");
+ if (ret < 0) {
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ if (rsp.op_ret == -1) {
+ cli_err ("getwd failed");
+ ret = rsp.op_ret;
goto out;
}
@@ -3642,7 +5210,7 @@ out:
}
int32_t
-gf_cli3_1_getwd (call_frame_t *frame, xlator_t *this, void *data)
+gf_cli_getwd (call_frame_t *frame, xlator_t *this, void *data)
{
int ret = -1;
gf1_cli_getwd_req req = {0,};
@@ -3655,7 +5223,7 @@ gf_cli3_1_getwd (call_frame_t *frame, xlator_t *this, void *data)
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_GETWD, NULL,
- this, gf_cli3_1_getwd_cbk,
+ this, gf_cli_getwd_cbk,
(xdrproc_t) xdr_gf1_cli_getwd_req);
out:
@@ -3664,39 +5232,1172 @@ out:
return ret;
}
-static int
-gf_cli3_1_status_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
+void
+cli_print_volume_status_mempool (dict_t *dict, char *prefix)
{
- gf1_cli_status_volume_rsp rsp = {0,};
- int ret = -1;
- dict_t *dict = NULL;
- char *hostname = NULL;
- char *path = NULL;
- int i = 0;
- int port = 0;
- int online = 0;
- char key[1024] = {0,};
- int pid = 0;
- char brick[8192] = {0,};
- char *volname = NULL;
+ int ret = -1;
+ int32_t mempool_count = 0;
+ char *name = NULL;
+ int32_t hotcount = 0;
+ int32_t coldcount = 0;
+ uint64_t paddedsizeof = 0;
+ uint64_t alloccount = 0;
+ int32_t maxalloc = 0;
+ uint64_t pool_misses = 0;
+ int32_t maxstdalloc = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (prefix);
+
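+        /* One row per mem-pool: the values come from the
+         * "<prefix>.pool<i>.*" keys filled in by the brick process. */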
+ memset (key, 0, sizeof (key));
+        snprintf (key, sizeof (key), "%s.mempool-count", prefix);
+ ret = dict_get_int32 (dict, key, &mempool_count);
+ if (ret)
+ goto out;
+
+ cli_out ("Mempool Stats\n-------------");
+ cli_out ("%-30s %9s %9s %12s %10s %8s %8s %12s", "Name", "HotCount",
+ "ColdCount", "PaddedSizeof", "AllocCount", "MaxAlloc",
+ "Misses", "Max-StdAlloc");
+ cli_out ("%-30s %9s %9s %12s %10s %8s %8s %12s", "----", "--------",
+ "---------", "------------", "----------",
+ "--------", "--------", "------------");
+
+ for (i = 0; i < mempool_count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.name", prefix, i);
+ ret = dict_get_str (dict, key, &name);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.hotcount", prefix, i);
+ ret = dict_get_int32 (dict, key, &hotcount);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.coldcount", prefix, i);
+ ret = dict_get_int32 (dict, key, &coldcount);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.paddedsizeof",
+ prefix, i);
+ ret = dict_get_uint64 (dict, key, &paddedsizeof);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.alloccount", prefix, i);
+ ret = dict_get_uint64 (dict, key, &alloccount);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.max_alloc", prefix, i);
+ ret = dict_get_int32 (dict, key, &maxalloc);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.max-stdalloc", prefix, i);
+ ret = dict_get_int32 (dict, key, &maxstdalloc);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.pool-misses", prefix, i);
+ ret = dict_get_uint64 (dict, key, &pool_misses);
+ if (ret)
+ goto out;
+
+ cli_out ("%-30s %9d %9d %12"PRIu64" %10"PRIu64" %8d %8"PRIu64
+ " %12d", name, hotcount, coldcount, paddedsizeof,
+ alloccount, maxalloc, pool_misses, maxstdalloc);
+ }
+
+out:
+ return;
+
+}
+
+void
+cli_print_volume_status_mem (dict_t *dict, gf_boolean_t notbrick)
+{
+ int ret = -1;
+ char *volname = NULL;
+ char *hostname = NULL;
+ char *path = NULL;
+ int online = -1;
+ char key[1024] = {0,};
+ int brick_index_max = -1;
+ int other_count = 0;
+ int index_max = 0;
+ int val = 0;
+ int i = 0;
+
+ GF_ASSERT (dict);
+
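+        /* For each brick index (bricks plus other processes) print the
+         * mallinfo counters followed by the mem-pool statistics. */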
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ cli_out ("Memory status for volume : %s", volname);
+
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+ for (i = 0; i <= index_max; i++) {
+ cli_out ("----------------------------------------------");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ continue;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", i);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ continue;
+ if (notbrick)
+ cli_out ("%s : %s", hostname, path);
+ else
+ cli_out ("Brick : %s:%s", hostname, path);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.status", i);
+ ret = dict_get_int32 (dict, key, &online);
+ if (ret)
+ goto out;
+ if (!online) {
+ if (notbrick)
+ cli_out ("%s is offline", hostname);
+ else
+ cli_out ("Brick is offline");
+ continue;
+ }
+
+ cli_out ("Mallinfo\n--------");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.arena", i);
+ ret = dict_get_int32 (dict, key, &val);
+ if (ret)
+ goto out;
+ cli_out ("%-8s : %d","Arena", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.ordblks", i);
+ ret = dict_get_int32 (dict, key, &val);
+                if (ret)
+ goto out;
+ cli_out ("%-8s : %d","Ordblks", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.smblks", i);
+ ret = dict_get_int32 (dict, key, &val);
+                if (ret)
+ goto out;
+ cli_out ("%-8s : %d","Smblks", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.hblks", i);
+ ret = dict_get_int32 (dict, key, &val);
+                if (ret)
+ goto out;
+ cli_out ("%-8s : %d", "Hblks", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.hblkhd", i);
+ ret = dict_get_int32 (dict, key, &val);
+ if (ret)
+ goto out;
+ cli_out ("%-8s : %d", "Hblkhd", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.usmblks", i);
+ ret = dict_get_int32 (dict, key, &val);
+ if (ret)
+ goto out;
+ cli_out ("%-8s : %d", "Usmblks", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.fsmblks", i);
+ ret = dict_get_int32 (dict, key, &val);
+ if (ret)
+ goto out;
+ cli_out ("%-8s : %d", "Fsmblks", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.uordblks", i);
+ ret = dict_get_int32 (dict, key, &val);
+ if (ret)
+ goto out;
+ cli_out ("%-8s : %d", "Uordblks", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.fordblks", i);
+ ret = dict_get_int32 (dict, key, &val);
+ if (ret)
+ goto out;
+ cli_out ("%-8s : %d", "Fordblks", val);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.keepcost", i);
+ ret = dict_get_int32 (dict, key, &val);
+ if (ret)
+ goto out;
+ cli_out ("%-8s : %d", "Keepcost", val);
+
+ cli_out (" ");
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d", i);
+ cli_print_volume_status_mempool (dict, key);
+ }
+out:
+ cli_out ("----------------------------------------------\n");
+ return;
+}
+
+void
+cli_print_volume_status_clients (dict_t *dict, gf_boolean_t notbrick)
+{
+ int ret = -1;
+ char *volname = NULL;
+ int brick_index_max = -1;
+ int other_count = 0;
+ int index_max = 0;
+ char *hostname = NULL;
+ char *path = NULL;
+ int online = -1;
+ int client_count = 0;
+ char *clientname = NULL;
+ uint64_t bytesread = 0;
+ uint64_t byteswrite = 0;
+ char key[1024] = {0,};
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (dict);
+
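+        /* Per brick: list every connected client along with the bytes it has
+         * read from and written to that brick. */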
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ cli_out ("Client connections for volume %s", volname);
+
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+ for (i = 0; i <= index_max; i++) {
+ cli_out ("----------------------------------------------");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", i);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+
+ if (notbrick)
+ cli_out ("%s : %s", hostname, path);
+ else
+ cli_out ("Brick : %s:%s", hostname, path);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.status", i);
+ ret = dict_get_int32 (dict, key, &online);
+ if (ret)
+ goto out;
+ if (!online) {
+ if (notbrick)
+ cli_out ("%s is offline", hostname);
+ else
+ cli_out ("Brick is offline");
+ continue;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.clientcount", i);
+ ret = dict_get_int32 (dict, key, &client_count);
+ if (ret)
+ goto out;
+
+ cli_out ("Clients connected : %d", client_count);
+ if (client_count == 0)
+ continue;
+
+ cli_out ("%-48s %15s %15s", "Hostname", "BytesRead",
+ "BytesWritten");
+ cli_out ("%-48s %15s %15s", "--------", "---------",
+ "------------");
+ for (j =0; j < client_count; j++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "brick%d.client%d.hostname", i, j);
+ ret = dict_get_str (dict, key, &clientname);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "brick%d.client%d.bytesread", i, j);
+ ret = dict_get_uint64 (dict, key, &bytesread);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "brick%d.client%d.byteswrite", i, j);
+ ret = dict_get_uint64 (dict, key, &byteswrite);
+ if (ret)
+ goto out;
+
+ cli_out ("%-48s %15"PRIu64" %15"PRIu64,
+ clientname, bytesread, byteswrite);
+ }
+ }
+out:
+ cli_out ("----------------------------------------------\n");
+ return;
+}
+
+void
+cli_print_volume_status_inode_entry (dict_t *dict, char *prefix)
+{
+ int ret = -1;
+ char key[1024] = {0,};
+ char *gfid = NULL;
+ uint64_t nlookup = 0;
+ uint32_t ref = 0;
+ int ia_type = 0;
+ char inode_type;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (prefix);
+
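+        /* Prints one inode table entry: GFID, lookup count, ref count and a
+         * single-letter ia_type code (R/D/L/B/C/F/S, 'I' for invalid). */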
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.gfid", prefix);
+ ret = dict_get_str (dict, key, &gfid);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.nlookup", prefix);
+ ret = dict_get_uint64 (dict, key, &nlookup);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.ref", prefix);
+ ret = dict_get_uint32 (dict, key, &ref);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.ia_type", prefix);
+ ret = dict_get_int32 (dict, key, &ia_type);
+ if (ret)
+ goto out;
+
+ switch (ia_type) {
+ case IA_IFREG:
+ inode_type = 'R';
+ break;
+ case IA_IFDIR:
+ inode_type = 'D';
+ break;
+ case IA_IFLNK:
+ inode_type = 'L';
+ break;
+ case IA_IFBLK:
+ inode_type = 'B';
+ break;
+ case IA_IFCHR:
+ inode_type = 'C';
+ break;
+ case IA_IFIFO:
+ inode_type = 'F';
+ break;
+ case IA_IFSOCK:
+ inode_type = 'S';
+ break;
+ default:
+ inode_type = 'I';
+ break;
+ }
+
+ cli_out ("%-40s %14"PRIu64" %14"PRIu32" %9c",
+ gfid, nlookup, ref, inode_type);
+
+out:
+ return;
+
+}
+
+void
+cli_print_volume_status_itables (dict_t *dict, char *prefix)
+{
+ int ret = -1;
+ char key[1024] = {0,};
+ uint32_t active_size = 0;
+ uint32_t lru_size = 0;
+ uint32_t purge_size = 0;
+        int              i = 0;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (prefix);
+
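+        /* An inode table has three lists -- active, LRU and purge -- each
+         * printed under the same GFID/Lookups/Ref/IA-type header. */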
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.active_size", prefix);
+ ret = dict_get_uint32 (dict, key, &active_size);
+ if (ret)
+ goto out;
+ if (active_size != 0) {
+ cli_out ("Active inodes:");
+ cli_out ("%-40s %14s %14s %9s", "GFID", "Lookups", "Ref",
+ "IA type");
+ cli_out ("%-40s %14s %14s %9s", "----", "-------", "---",
+ "-------");
+ }
+ for (i = 0; i < active_size; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.active%d", prefix, i);
+ cli_print_volume_status_inode_entry (dict, key);
+ }
+ cli_out (" ");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.lru_size", prefix);
+ ret = dict_get_uint32 (dict, key, &lru_size);
+ if (ret)
+ goto out;
+ if (lru_size != 0) {
+ cli_out ("LRU inodes:");
+ cli_out ("%-40s %14s %14s %9s", "GFID", "Lookups", "Ref",
+ "IA type");
+ cli_out ("%-40s %14s %14s %9s", "----", "-------", "---",
+ "-------");
+ }
+ for (i = 0; i < lru_size; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.lru%d", prefix, i);
+ cli_print_volume_status_inode_entry (dict, key);
+ }
+ cli_out (" ");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.purge_size", prefix);
+ ret = dict_get_uint32 (dict, key, &purge_size);
+ if (ret)
+ goto out;
+ if (purge_size != 0) {
+ cli_out ("Purged inodes:");
+ cli_out ("%-40s %14s %14s %9s", "GFID", "Lookups", "Ref",
+ "IA type");
+ cli_out ("%-40s %14s %14s %9s", "----", "-------", "---",
+ "-------");
+ }
+ for (i = 0; i < purge_size; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.purge%d", prefix, i);
+ cli_print_volume_status_inode_entry (dict, key);
+ }
+
+out:
+ return;
+}
+
+void
+cli_print_volume_status_inode (dict_t *dict, gf_boolean_t notbrick)
+{
+ int ret = -1;
+ char *volname = NULL;
+ int brick_index_max = -1;
+ int other_count = 0;
+ int index_max = 0;
+ char *hostname = NULL;
+ char *path = NULL;
+ int online = -1;
+ int conn_count = 0;
+ char key[1024] = {0,};
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ cli_out ("Inode tables for volume %s", volname);
+
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+        for (i = 0; i <= index_max; i++) {
+ cli_out ("----------------------------------------------");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", i);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+ if (notbrick)
+ cli_out ("%s : %s", hostname, path);
+ else
+ cli_out ("Brick : %s:%s", hostname, path);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.status", i);
+ ret = dict_get_int32 (dict, key, &online);
+ if (ret)
+ goto out;
+ if (!online) {
+ if (notbrick)
+ cli_out ("%s is offline", hostname);
+ else
+ cli_out ("Brick is offline");
+ continue;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.conncount", i);
+ ret = dict_get_int32 (dict, key, &conn_count);
+ if (ret)
+ goto out;
+
+ for (j = 0; j < conn_count; j++) {
+ if (conn_count > 1)
+ cli_out ("Connection %d:", j+1);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.conn%d.itable",
+ i, j);
+ cli_print_volume_status_itables (dict, key);
+ cli_out (" ");
+ }
+ }
+out:
+ cli_out ("----------------------------------------------");
+ return;
+}
+
+void
+cli_print_volume_status_fdtable (dict_t *dict, char *prefix)
+{
+ int ret = -1;
+ char key[1024] = {0,};
+ int refcount = 0;
+ uint32_t maxfds = 0;
+ int firstfree = 0;
+ int openfds = 0;
+ int fd_pid = 0;
+ int fd_refcount = 0;
+ int fd_flags = 0;
+ int i = 0;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (prefix);
+
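+        /* Prints the fd table summary and then one line per open fd; slots
+         * with no "<prefix>.fdentry<i>.*" keys are skipped. */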
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.refcount", prefix);
+ ret = dict_get_int32 (dict, key, &refcount);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.maxfds", prefix);
+ ret = dict_get_uint32 (dict, key, &maxfds);
+ if (ret)
+ goto out;
+
+        memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.firstfree", prefix);
+ ret = dict_get_int32 (dict, key, &firstfree);
+ if (ret)
+ goto out;
+
+ cli_out ("RefCount = %d MaxFDs = %d FirstFree = %d",
+ refcount, maxfds, firstfree);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.openfds", prefix);
+ ret = dict_get_int32 (dict, key, &openfds);
+ if (ret)
+ goto out;
+ if (0 == openfds) {
+ cli_err ("No open fds");
+ goto out;
+ }
+
+ cli_out ("%-19s %-19s %-19s %-19s", "FD Entry", "PID",
+ "RefCount", "Flags");
+ cli_out ("%-19s %-19s %-19s %-19s", "--------", "---",
+ "--------", "-----");
+
+ for (i = 0; i < maxfds ; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.fdentry%d.pid", prefix, i);
+ ret = dict_get_int32 (dict, key, &fd_pid);
+ if (ret)
+ continue;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.fdentry%d.refcount",
+ prefix, i);
+ ret = dict_get_int32 (dict, key, &fd_refcount);
+ if (ret)
+ continue;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.fdentry%d.flags", prefix, i);
+ ret = dict_get_int32 (dict, key, &fd_flags);
+ if (ret)
+ continue;
+
+ cli_out ("%-19d %-19d %-19d %-19d", i, fd_pid, fd_refcount,
+ fd_flags);
+ }
+
+out:
+ return;
+}
+
+void
+cli_print_volume_status_fd (dict_t *dict, gf_boolean_t notbrick)
+{
+ int ret = -1;
+ char *volname = NULL;
+ int brick_index_max = -1;
+ int other_count = 0;
+ int index_max = 0;
+ char *hostname = NULL;
+ char *path = NULL;
+ int online = -1;
+ int conn_count = 0;
+ char key[1024] = {0,};
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ cli_out ("FD tables for volume %s", volname);
+
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+ for (i = 0; i <= index_max; i++) {
+ cli_out ("----------------------------------------------");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", i);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+
+ if (notbrick)
+ cli_out ("%s : %s", hostname, path);
+ else
+ cli_out ("Brick : %s:%s", hostname, path);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.status", i);
+ ret = dict_get_int32 (dict, key, &online);
+ if (ret)
+ goto out;
+ if (!online) {
+ if (notbrick)
+ cli_out ("%s is offline", hostname);
+ else
+ cli_out ("Brick is offline");
+ continue;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.conncount", i);
+ ret = dict_get_int32 (dict, key, &conn_count);
+ if (ret)
+ goto out;
+
+ for (j = 0; j < conn_count; j++) {
+ cli_out ("Connection %d:", j+1);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.conn%d.fdtable",
+ i, j);
+ cli_print_volume_status_fdtable (dict, key);
+ cli_out (" ");
+ }
+ }
+out:
+ cli_out ("----------------------------------------------");
+ return;
+}
+
+void
+cli_print_volume_status_call_frame (dict_t *dict, char *prefix)
+{
+ int ret = -1;
+ char key[1024] = {0,};
+ int ref_count = 0;
+ char *translator = 0;
+ int complete = 0;
+ char *parent = NULL;
+ char *wind_from = NULL;
+ char *wind_to = NULL;
+ char *unwind_from = NULL;
+ char *unwind_to = NULL;
+
+ if (!dict || !prefix)
+ return;
+
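+        /* A frame is printed as ref count, translator and completion state,
+         * plus parent and wind/unwind locations when the dict carries them. */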
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.refcount", prefix);
+ ret = dict_get_int32 (dict, key, &ref_count);
+ if (ret)
+ return;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.translator", prefix);
+ ret = dict_get_str (dict, key, &translator);
+ if (ret)
+ return;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.complete", prefix);
+ ret = dict_get_int32 (dict, key, &complete);
+ if (ret)
+ return;
+
+ cli_out (" Ref Count = %d", ref_count);
+ cli_out (" Translator = %s", translator);
+ cli_out (" Completed = %s", (complete ? "Yes" : "No"));
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.parent", prefix);
+ ret = dict_get_str (dict, key, &parent);
+ if (!ret)
+ cli_out (" Parent = %s", parent);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.windfrom", prefix);
+ ret = dict_get_str (dict, key, &wind_from);
+ if (!ret)
+ cli_out (" Wind From = %s", wind_from);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.windto", prefix);
+ ret = dict_get_str (dict, key, &wind_to);
+ if (!ret)
+ cli_out (" Wind To = %s", wind_to);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.unwindfrom", prefix);
+ ret = dict_get_str (dict, key, &unwind_from);
+ if (!ret)
+ cli_out (" Unwind From = %s", unwind_from);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.unwindto", prefix);
+ ret = dict_get_str (dict, key, &unwind_to);
+ if (!ret)
+ cli_out (" Unwind To = %s", unwind_to);
+}
+
+void
+cli_print_volume_status_call_stack (dict_t *dict, char *prefix)
+{
+ int ret = -1;
+ char key[1024] = {0,};
+ int uid = 0;
+ int gid = 0;
+ int pid = 0;
+ uint64_t unique = 0;
+ //char *op = NULL;
+ int count = 0;
+ int i = 0;
+
+ if (!dict || !prefix)
+ return;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.uid", prefix);
+ ret = dict_get_int32 (dict, key, &uid);
+ if (ret)
+ return;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.gid", prefix);
+ ret = dict_get_int32 (dict, key, &gid);
+ if (ret)
+ return;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pid", prefix);
+ ret = dict_get_int32 (dict, key, &pid);
+ if (ret)
+ return;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.unique", prefix);
+ ret = dict_get_uint64 (dict, key, &unique);
+ if (ret)
+ return;
+
+ /*
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.op", prefix);
+ ret = dict_get_str (dict, key, &op);
+ if (ret)
+ return;
+ */
+
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.count", prefix);
+ ret = dict_get_int32 (dict, key, &count);
+ if (ret)
+ return;
+
+ cli_out (" UID : %d", uid);
+ cli_out (" GID : %d", gid);
+ cli_out (" PID : %d", pid);
+ cli_out (" Unique : %"PRIu64, unique);
+ //cli_out ("\tOp : %s", op);
+ cli_out (" Frames : %d", count);
+
+ for (i = 0; i < count; i++) {
+ cli_out (" Frame %d", i+1);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.frame%d", prefix, i);
+ cli_print_volume_status_call_frame (dict, key);
+ }
+
+ cli_out (" ");
+}
+
+void
+cli_print_volume_status_callpool (dict_t *dict, gf_boolean_t notbrick)
+{
+ int ret = -1;
+ char *volname = NULL;
+ int brick_index_max = -1;
+ int other_count = 0;
+ int index_max = 0;
+ char *hostname = NULL;
+ char *path = NULL;
+ int online = -1;
+ int call_count = 0;
+ char key[1024] = {0,};
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ cli_out ("Pending calls for volume %s", volname);
+
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+ for (i = 0; i <= index_max; i++) {
+ cli_out ("----------------------------------------------");
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", i);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+
+ if (notbrick)
+ cli_out ("%s : %s", hostname, path);
+ else
+ cli_out ("Brick : %s:%s", hostname, path);
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.status", i);
+ ret = dict_get_int32 (dict, key, &online);
+ if (ret)
+ goto out;
+ if (!online) {
+ if (notbrick)
+ cli_out ("%s is offline", hostname);
+ else
+ cli_out ("Brick is offline");
+ continue;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.callpool.count", i);
+ ret = dict_get_int32 (dict, key, &call_count);
+ if (ret)
+ goto out;
+ cli_out ("Pending calls: %d", call_count);
+
+ if (0 == call_count)
+ continue;
+
+ for (j = 0; j < call_count; j++) {
+ cli_out ("Call Stack%d", j+1);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "brick%d.callpool.stack%d", i, j);
+ cli_print_volume_status_call_stack (dict, key);
+ }
+ }
+
+out:
+ cli_out ("----------------------------------------------");
+ return;
+}
+
+static void
+cli_print_volume_status_tasks (dict_t *dict)
+{
+ int ret = -1;
+ int i = 0;
+ int j = 0;
+ int count = 0;
+ int task_count = 0;
+ int status = 0;
+ char *op = NULL;
+ char *task_id_str = NULL;
+ char *volname = NULL;
+ char key[1024] = {0,};
+ char task[1024] = {0,};
+ char *brick = NULL;
+ char *src_brick = NULL;
+ char *dest_brick = NULL;
+
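+        /* Lists the active volume tasks; "Replace brick" and "Remove brick"
+         * get extra per-brick detail before the common status line. */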
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, "tasks", &task_count);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get tasks count");
+ return;
+ }
+
+ cli_out ("Task Status of Volume %s", volname);
+ cli_print_line (CLI_BRICK_STATUS_LINE_LEN);
+
+ if (task_count == 0) {
+ cli_out ("There are no active volume tasks");
+ cli_out (" ");
+ return;
+ }
+
+ for (i = 0; i < task_count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.type", i);
+ ret = dict_get_str(dict, key, &op);
+ if (ret)
+ return;
+ cli_out ("%-20s : %-20s", "Task", op);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.id", i);
+ ret = dict_get_str (dict, key, &task_id_str);
+ if (ret)
+ return;
+ cli_out ("%-20s : %-20s", "ID", task_id_str);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.status", i);
+ ret = dict_get_int32 (dict, key, &status);
+ if (ret)
+ return;
+
+ snprintf (task, sizeof (task), "task%d", i);
+
+ /*
+ Replace brick only has two states - In progress and Complete
+ Ref: xlators/mgmt/glusterd/src/glusterd-replace-brick.c
+ */
+
+ if (!strcmp (op, "Replace brick")) {
+ if (status)
+ status = GF_DEFRAG_STATUS_COMPLETE;
+ else
+ status = GF_DEFRAG_STATUS_STARTED;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.src-brick", task);
+ ret = dict_get_str (dict, key, &src_brick);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s : %-20s", "Source Brick", src_brick);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.dst-brick", task);
+ ret = dict_get_str (dict, key, &dest_brick);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s : %-20s", "Destination Brick",
+ dest_brick);
+
+ } else if (!strcmp (op, "Remove brick")) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.count", task);
+ ret = dict_get_int32 (dict, key, &count);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s", "Removed bricks:");
+
+ for (j = 1; j <= count; j++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),"%s.brick%d",
+ task, j);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+
+ cli_out ("%-20s", brick);
+ }
+ }
+ cli_out ("%-20s : %-20s", "Status",
+ cli_vol_task_status_str[status]);
+ cli_out (" ");
+ }
+
+out:
+ return;
+}
+
+static int
+gf_cli_status_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ int brick_index_max = -1;
+ int other_count = 0;
+ int index_max = 0;
+ int i = 0;
+ int pid = -1;
+ uint32_t cmd = 0;
+ gf_boolean_t notbrick = _gf_false;
+ char key[1024] = {0,};
+ char *hostname = NULL;
+ char *path = NULL;
+ char *volname = NULL;
+ dict_t *dict = NULL;
+ gf_cli_rsp rsp = {0,};
+ cli_volume_status_t status = {0};
+ cli_local_t *local = NULL;
+ gf_boolean_t wipe_local = _gf_false;
+ char msg[1024] = {0,};
if (req->rpc_status == -1)
goto out;
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_status_volume_rsp);
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("cli", GF_LOG_ERROR, "Volume status response error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
gf_log ("cli", GF_LOG_DEBUG, "Received response to status cmd");
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
- else if (rsp.op_ret)
- cli_out ("Unable to obtain volume status information.");
+ local = ((call_frame_t *)myframe)->local;
+ if (!local) {
+ local = cli_local_get ();
+ if (!local) {
+ ret = -1;
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get local");
+ goto out;
+ }
+ wipe_local = _gf_true;
+ }
+
+ if (rsp.op_ret) {
+ if (strcmp (rsp.op_errstr, ""))
+ snprintf (msg, sizeof (msg), "%s", rsp.op_errstr);
+ else
+ snprintf (msg, sizeof (msg), "Unable to obtain volume "
+ "status information.");
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ if (!local->all)
+ cli_xml_output_str ("volStatus", msg,
+ rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ ret = 0;
+ goto out;
+ }
+
+ cli_err ("%s", msg);
+ if (local && local->all) {
+ ret = 0;
+ cli_out (" ");
+ } else
+ ret = -1;
+
+ goto out;
+ }
dict = dict_new ();
if (!dict)
@@ -3708,92 +6409,335 @@ gf_cli3_1_status_cbk (struct rpc_req *req, struct iovec *iov,
if (ret)
goto out;
-
- ret = dict_get_int32 (dict, "count", &count);
+ ret = dict_get_uint32 (dict, "cmd", &cmd);
if (ret)
goto out;
+ if ((cmd & GF_CLI_STATUS_ALL)) {
+ if (local && local->dict) {
+ dict_ref (dict);
+ ret = dict_set_static_ptr (local->dict, "rsp-dict", dict);
+ ret = 0;
+ } else {
+ gf_log ("cli", GF_LOG_ERROR, "local not found");
+ ret = -1;
+ }
+ goto out;
+ }
+
+ if ((cmd & GF_CLI_STATUS_NFS) || (cmd & GF_CLI_STATUS_SHD))
+ notbrick = _gf_true;
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ if (!local->all) {
+ ret = cli_xml_output_vol_status_begin (local,
+ rsp.op_ret,
+ rsp.op_errno,
+ rsp.op_errstr);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+ }
+ if (cmd & GF_CLI_STATUS_TASKS) {
+ ret = cli_xml_output_vol_status_tasks_detail (local,
+ dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,"Error outputting "
+ "to xml");
+ goto out;
+ }
+ } else {
+ ret = cli_xml_output_vol_status (local, dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+ }
+
+ if (!local->all) {
+ ret = cli_xml_output_vol_status_end (local);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ }
+ }
+ goto out;
+ }
+
+ status.brick = GF_CALLOC (1, PATH_MAX + 256, gf_common_mt_strdup);
+
+ switch (cmd & GF_CLI_STATUS_MASK) {
+ case GF_CLI_STATUS_MEM:
+ cli_print_volume_status_mem (dict, notbrick);
+ goto cont;
+ break;
+ case GF_CLI_STATUS_CLIENTS:
+ cli_print_volume_status_clients (dict, notbrick);
+ goto cont;
+ break;
+ case GF_CLI_STATUS_INODE:
+ cli_print_volume_status_inode (dict, notbrick);
+ goto cont;
+ break;
+ case GF_CLI_STATUS_FD:
+ cli_print_volume_status_fd (dict, notbrick);
+ goto cont;
+ break;
+ case GF_CLI_STATUS_CALLPOOL:
+ cli_print_volume_status_callpool (dict, notbrick);
+ goto cont;
+ break;
+ case GF_CLI_STATUS_TASKS:
+ cli_print_volume_status_tasks (dict);
+ goto cont;
+ break;
+ default:
+ break;
+ }
+
ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+
+ cli_out ("Status of volume: %s", volname);
+
+ if ((cmd & GF_CLI_STATUS_DETAIL) == 0) {
+ cli_out ("Gluster process\t\t\t\t\t\tPort\tOnline\tPid");
+ cli_print_line (CLI_BRICK_STATUS_LINE_LEN);
+ }
+
+ for (i = 0; i <= index_max; i++) {
+
- cli_out ("Brick status for volume: %s", volname);
- cli_out ("Brick\t\t\t\t\t\t\tPort\tOnline\tPID");
- for (i = 0; i < count; i++) {
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "brick%d.hostname", i);
ret = dict_get_str (dict, key, &hostname);
if (ret)
- goto out;
+ continue;
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "brick%d.path", i);
ret = dict_get_str (dict, key, &path);
if (ret)
- goto out;
+ continue;
+
+ /* Brick/not-brick is handled separately here as all
+ * types of nodes are contained in the default output
+ */
+ memset (status.brick, 0, PATH_MAX + 255);
+ if (!strcmp (hostname, "NFS Server") ||
+ !strcmp (hostname, "Self-heal Daemon"))
+ snprintf (status.brick, PATH_MAX + 255, "%s on %s",
+ hostname, path);
+ else
+ snprintf (status.brick, PATH_MAX + 255, "Brick %s:%s",
+ hostname, path);
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "brick%d.port", i);
- ret = dict_get_int32 (dict, key, &port);
+ ret = dict_get_int32 (dict, key, &(status.port));
if (ret)
- goto out;
+ continue;
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "brick%d.status", i);
- ret = dict_get_int32 (dict, key, &online);
+ ret = dict_get_int32 (dict, key, &(status.online));
if (ret)
- goto out;
+ continue;
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "brick%d.pid", i);
ret = dict_get_int32 (dict, key, &pid);
+ if (ret)
+ continue;
+ if (pid == -1)
+ ret = gf_asprintf (&(status.pid_str), "%s", "N/A");
+ else
+ ret = gf_asprintf (&(status.pid_str), "%d", pid);
- snprintf (brick, sizeof (brick) -1, "%s:%s", hostname, path);
+ if (ret == -1)
+ goto out;
- cli_print_line (CLI_BRICK_STATUS_LINE_LEN);
- cli_print_brick_status (brick, port, online, pid);
+ if ((cmd & GF_CLI_STATUS_DETAIL)) {
+ ret = cli_get_detail_status (dict, i, &status);
+ if (ret)
+ goto out;
+ cli_print_line (CLI_BRICK_STATUS_LINE_LEN);
+ cli_print_detailed_status (&status);
+
+ } else {
+ cli_print_brick_status (&status);
+ }
}
+ cli_out (" ");
+ if ((cmd & GF_CLI_STATUS_MASK) == GF_CLI_STATUS_NONE)
+ cli_print_volume_status_tasks (dict);
+cont:
ret = rsp.op_ret;
- out:
+out:
+ if (dict)
+ dict_unref (dict);
+ GF_FREE (status.brick);
+ if (local && wipe_local) {
+ cli_local_wipe (local);
+ }
+
cli_cmd_broadcast_response (ret);
return ret;
}
int32_t
-gf_cli3_1_status_volume (call_frame_t *frame, xlator_t *this,
- void *data)
+gf_cli_status_volume (call_frame_t *frame, xlator_t *this,
+ void *data)
{
- gf1_cli_status_volume_req req = {0,};
- int ret = 0;
- dict_t *dict = NULL;
+ gf_cli_req req = {{0,}};
+ int ret = -1;
+ dict_t *dict = NULL;
- if (!frame || !this || !data) {
- ret = -1;
+ if (!frame || !this || !data)
goto out;
- }
dict = data;
- ret = dict_get_str (dict, "volname", &req.volname);
+ ret = cli_to_glusterd (&req, frame, gf_cli_status_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_STATUS_VOLUME, this, cli_rpc_prog,
+ NULL);
+ out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning: %d", ret);
+ return ret;
+}
+
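+/* 'status all' is implemented as one request for the list of volumes
+ * followed by a per-volume status request. The callback of the first
+ * request stores its response dictionary in local->dict as "rsp-dict",
+ * from which the volume names are read here.
+ */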
+int
+gf_cli_status_volume_all (call_frame_t *frame, xlator_t *this, void *data)
+{
+ int i = 0;
+ int ret = -1;
+ int vol_count = -1;
+ uint32_t cmd = 0;
+ char key[1024] = {0};
+ char *volname = NULL;
+ void *vol_dict = NULL;
+ dict_t *dict = NULL;
+ cli_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ local->all = _gf_true;
+ } else
+ goto out;
+
+ ret = dict_get_uint32 (local->dict, "cmd", &cmd);
if (ret)
goto out;
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_STATUS_VOLUME, NULL,
- this, gf_cli3_1_status_cbk,
- (xdrproc_t)xdr_gf1_cli_status_volume_req);
+
+ ret = gf_cli_status_volume (frame, this, data);
+ if (ret)
+ goto out;
+
+ ret = dict_get_ptr (local->dict, "rsp-dict", &vol_dict);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 ((dict_t *)vol_dict, "vol_count", &vol_count);
+ if (ret) {
+ cli_err ("Failed to get names of volumes");
+ goto out;
+ }
+
+ /* remove the "all" flag in cmd */
+ cmd &= ~GF_CLI_STATUS_ALL;
+ cmd |= GF_CLI_STATUS_VOL;
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ //TODO: Pass proper op_* values
+ ret = cli_xml_output_vol_status_begin (local, 0, 0, NULL);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+ }
+
+ if (vol_count == 0 && !(global_state->mode & GLUSTER_MODE_XML)) {
+ cli_err ("No volumes present");
+ ret = 0;
+ goto out;
+ }
+
+ for (i = 0; i < vol_count; i++) {
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "vol%d", i);
+ ret = dict_get_str (vol_dict, key, &volname);
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (dict, "volname", volname);
+ if (ret)
+ goto out;
+
+ ret = dict_set_uint32 (dict, "cmd", cmd);
+ if (ret)
+ goto out;
+
+ ret = gf_cli_status_volume (frame, this, dict);
+ if (ret)
+ goto out;
+
+ dict_unref (dict);
+ }
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_status_end (local);
+ }
out:
- gf_log ("cli", GF_LOG_DEBUG, "Returning: %d", ret);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR, "status all failed");
+
+ if (vol_dict)
+ dict_unref (vol_dict);
+
+ if (ret && dict)
+ dict_unref (dict);
+
+ if (local)
+ cli_local_wipe (local);
+
+ if (frame)
+ frame->local = NULL;
+
return ret;
}
static int
-gf_cli3_1_mount_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_mount_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gf1_cli_mount_rsp rsp = {0,};
- int ret = 0;
+ int ret = -1;
if (-1 == req->rpc_status) {
goto out;
@@ -3801,7 +6745,8 @@ gf_cli3_1_mount_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_mount_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -3812,9 +6757,9 @@ gf_cli3_1_mount_cbk (struct rpc_req *req, struct iovec *iov,
cli_out ("%s", rsp.path);
} else {
/* weird sounding but easy to parse... */
- cli_out ("%d : failed with this errno (%s)",
+ cli_err ("%d : failed with this errno (%s)",
rsp.op_errno, strerror (rsp.op_errno));
- ret = 1;
+ ret = -1;
}
out:
@@ -3823,7 +6768,7 @@ out:
}
int32_t
-gf_cli3_1_mount (call_frame_t *frame, xlator_t *this, void *data)
+gf_cli_mount (call_frame_t *frame, xlator_t *this, void *data)
{
gf1_cli_mount_req req = {0,};
int ret = -1;
@@ -3839,7 +6784,7 @@ gf_cli3_1_mount (call_frame_t *frame, xlator_t *this, void *data)
req.label = label;
ret = dict_allocate_and_serialize (dict, &req.dict.dict_val,
- (size_t *)&req.dict.dict_len);
+ &req.dict.dict_len);
if (ret) {
ret = -1;
goto out;
@@ -3847,7 +6792,7 @@ gf_cli3_1_mount (call_frame_t *frame, xlator_t *this, void *data)
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_MOUNT, NULL,
- this, gf_cli3_1_mount_cbk,
+ this, gf_cli_mount_cbk,
(xdrproc_t)xdr_gf1_cli_mount_req);
out:
@@ -3856,11 +6801,11 @@ out:
}
static int
-gf_cli3_1_umount_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_umount_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gf1_cli_umount_rsp rsp = {0,};
- int ret = 0;
+ int ret = -1;
if (-1 == req->rpc_status) {
goto out;
@@ -3868,7 +6813,8 @@ gf_cli3_1_umount_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_umount_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
@@ -3877,8 +6823,8 @@ gf_cli3_1_umount_cbk (struct rpc_req *req, struct iovec *iov,
if (rsp.op_ret == 0)
ret = 0;
else {
- cli_out ("umount failed");
- ret = 1;
+ cli_err ("umount failed");
+ ret = -1;
}
out:
@@ -3887,7 +6833,7 @@ out:
}
int32_t
-gf_cli3_1_umount (call_frame_t *frame, xlator_t *this, void *data)
+gf_cli_umount (call_frame_t *frame, xlator_t *this, void *data)
{
gf1_cli_umount_req req = {0,};
int ret = -1;
@@ -3909,7 +6855,7 @@ gf_cli3_1_umount (call_frame_t *frame, xlator_t *this, void *data)
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GLUSTER_CLI_UMOUNT, NULL,
- this, gf_cli3_1_umount_cbk,
+ this, gf_cli_umount_cbk,
(xdrproc_t)xdr_gf1_cli_umount_req);
out:
@@ -3917,110 +6863,453 @@ gf_cli3_1_umount (call_frame_t *frame, xlator_t *this, void *data)
return ret;
}
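+/* Print the self-heal crawl statistics gathered for one brick: for every
+ * recorded crawl, its type, start/end time and the number of healed,
+ * split-brain and heal-failed entries.
+ */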
+void
+cmd_heal_volume_statistics_out (dict_t *dict, int brick)
+{
+
+ uint64_t num_entries = 0;
+ int ret = 0;
+ char key[256] = {0};
+ char *hostname = NULL;
+ uint64_t i = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = NULL;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+
+ snprintf (key, sizeof key, "%d-hostname", brick);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ cli_out ("------------------------------------------------");
+ cli_out ("\nCrawl statistics for brick no %d", brick);
+ cli_out ("Hostname of brick %s", hostname);
+
+ snprintf (key, sizeof key, "statistics-%d-count", brick);
+ ret = dict_get_uint64 (dict, key, &num_entries);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < num_entries; i++)
+ {
+ snprintf (key, sizeof key, "statistics_crawl_type-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_str (dict, key, &crawl_type);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof key, "statistics_healed_cnt-%d-%"PRIu64,
+ brick,i);
+ ret = dict_get_uint64 (dict, key, &healed_count);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof key, "statistics_sb_cnt-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_uint64 (dict, key, &split_brain_count);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_heal_failed_cnt-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_uint64 (dict, key, &heal_failed_count);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_strt_time-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_str (dict, key, &start_time_str);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_end_time-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_str (dict, key, &end_time_str);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "statistics_inprogress-%d-%"PRIu64,
+ brick, i);
+ ret = dict_get_int32 (dict, key, &progress);
+ if (ret)
+ goto out;
+
+ cli_out ("\nStarting time of crawl: %s", start_time_str);
+ if (progress == 1)
+ cli_out ("Crawl is in progress");
+ else
+ cli_out ("Ending time of crawl: %s", end_time_str);
+
+ cli_out ("Type of crawl: %s", crawl_type);
+ cli_out ("No. of entries healed: %"PRIu64,
+ healed_count);
+ cli_out ("No. of entries in split-brain: %"PRIu64,
+ split_brain_count);
+ cli_out ("No. of heal failed entries: %"PRIu64,
+ heal_failed_count);
+
+ }
+
+
+out:
+ return;
+}
+
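+/* Print the heal-info entries reported by one brick: brick host/path and
+ * status, followed by the list of entries needing heal, with timestamps
+ * when available.
+ */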
+void
+cmd_heal_volume_brick_out (dict_t *dict, int brick)
+{
+ uint64_t num_entries = 0;
+ int ret = 0;
+ char key[256] = {0};
+ char *hostname = NULL;
+ char *path = NULL;
+ char *status = NULL;
+ uint64_t i = 0;
+ uint32_t time = 0;
+ char timestr[32] = {0};
+ char *shd_status = NULL;
+
+ snprintf (key, sizeof key, "%d-hostname", brick);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "%d-path", brick);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+ cli_out ("\nBrick %s:%s", hostname, path);
+
+ snprintf (key, sizeof key, "%d-status", brick);
+ ret = dict_get_str (dict, key, &status);
+ if (status && strlen (status))
+ cli_out ("Status: %s", status);
+
+ snprintf (key, sizeof key, "%d-shd-status",brick);
+ ret = dict_get_str (dict, key, &shd_status);
+
+ if (!shd_status) {
+ snprintf (key, sizeof key, "%d-count", brick);
+ ret = dict_get_uint64 (dict, key, &num_entries);
+ cli_out ("Number of entries: %"PRIu64, num_entries);
+
+
+ for (i = 0; i < num_entries; i++) {
+ snprintf (key, sizeof key, "%d-%"PRIu64, brick, i);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ continue;
+ time = 0;
+ snprintf (key, sizeof key, "%d-%"PRIu64"-time",
+ brick, i);
+ ret = dict_get_uint32 (dict, key, &time);
+ if (!time) {
+ cli_out ("%s", path);
+ } else {
+ gf_time_fmt (timestr, sizeof timestr,
+ time, gf_timefmt_FT);
+ if (i == 0) {
+ cli_out ("at path on brick");
+ cli_out ("-----------------------------------");
+ }
+ cli_out ("%s %s", timestr, path);
+ }
+ }
+ }
+
+out:
+ return;
+}
+
+
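+/* Print the number of entries pending heal on one brick, as returned in
+ * the "<brick>-hardlinks" key of the response dictionary.
+ */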
+void
+cmd_heal_volume_statistics_heal_count_out (dict_t *dict, int brick)
+{
+ uint64_t num_entries = 0;
+ int ret = 0;
+ char key[256] = {0};
+ char *hostname = NULL;
+ char *path = NULL;
+ char *status = NULL;
+ char *shd_status = NULL;
+
+ snprintf (key, sizeof key, "%d-hostname", brick);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ snprintf (key, sizeof key, "%d-path", brick);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+ cli_out ("\nBrick %s:%s", hostname, path);
+
+ snprintf (key, sizeof key, "%d-status", brick);
+ ret = dict_get_str (dict, key, &status);
+ if (status && strlen (status))
+ cli_out ("Status: %s", status);
+
+ snprintf (key, sizeof key, "%d-shd-status",brick);
+ ret = dict_get_str (dict, key, &shd_status);
+
+ if (!shd_status) {
+ snprintf (key, sizeof key, "%d-hardlinks", brick);
+ ret = dict_get_uint64 (dict, key, &num_entries);
+ if (ret)
+ cli_out ("No gathered input for this brick");
+ else
+ cli_out ("Number of entries: %"PRIu64, num_entries);
+
+
+ }
+
+out:
+ return;
+}
+
+
int
-gf_cli3_1_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_heal_vol_rsp rsp = {0,};
- int ret = 0;
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
cli_local_t *local = NULL;
char *volname = NULL;
call_frame_t *frame = NULL;
+ dict_t *input_dict = NULL;
+ dict_t *dict = NULL;
+ int brick_count = 0;
+ int i = 0;
+ gf_xl_afr_op_t heal_op = GF_AFR_OP_INVALID;
+ char *operation = NULL;
+ char *substr = NULL;
+ char *heal_op_str = NULL;
if (-1 == req->rpc_status) {
goto out;
}
- ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf1_cli_heal_vol_rsp);
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- frame = myframe;
-
- if (frame) {
+ if (frame)
local = frame->local;
- frame->local = NULL;
- }
- if (local)
- volname = local->u.heal_vol.volname;
+ if (local) {
+ input_dict = local->dict;
+ ret = dict_get_int32 (input_dict, "heal-op",
+ (int32_t*)&heal_op);
+ }
+//TODO: Proper XML output
+//#if (HAVE_LIB_XML)
+// if (global_state->mode & GLUSTER_MODE_XML) {
+// ret = cli_xml_output_dict ("volHeal", dict, rsp.op_ret,
+// rsp.op_errno, rsp.op_errstr);
+// if (ret)
+// gf_log ("cli", GF_LOG_ERROR,
+// "Error outputting to xml");
+// goto out;
+// }
+//#endif
+
+ ret = dict_get_str (input_dict, "volname", &volname);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to get volname");
+ goto out;
+ }
gf_log ("cli", GF_LOG_INFO, "Received resp to heal volume");
- if (rsp.op_ret && strcmp (rsp.op_errstr, ""))
- cli_out ("%s", rsp.op_errstr);
- else
- cli_out ("Starting heal on volume %s has been %s", volname,
- (rsp.op_ret) ? "unsuccessful": "successful");
+ switch (heal_op) {
+ case GF_AFR_OP_HEAL_INDEX:
+ heal_op_str = "to perform index self heal";
+ break;
+ case GF_AFR_OP_HEAL_FULL:
+ heal_op_str = "to perform full self heal";
+ break;
+ case GF_AFR_OP_INDEX_SUMMARY:
+ heal_op_str = "list of entries to be healed";
+ break;
+ case GF_AFR_OP_HEALED_FILES:
+ heal_op_str = "list of healed entries";
+ break;
+ case GF_AFR_OP_HEAL_FAILED_FILES:
+ heal_op_str = "list of heal failed entries";
+ break;
+ case GF_AFR_OP_SPLIT_BRAIN_FILES:
+ heal_op_str = "list of split brain entries";
+ break;
+ case GF_AFR_OP_STATISTICS:
+ heal_op_str = "crawl statistics";
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ heal_op_str = "count of entries to be healed";
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ heal_op_str = "count of entries to be healed per replica";
+ break;
+ case GF_AFR_OP_INVALID:
+ heal_op_str = "invalid heal op";
+ break;
+ }
+
+ if ((heal_op == GF_AFR_OP_HEAL_FULL) ||
+ (heal_op == GF_AFR_OP_HEAL_INDEX)) {
+ operation = "Launching heal operation";
+ substr = "\nUse heal info commands to check status";
+ } else {
+ operation = "Gathering";
+ substr = "";
+ }
+
+ if (rsp.op_ret) {
+ if (strcmp (rsp.op_errstr, "")) {
+ cli_err ("%s", rsp.op_errstr);
+ } else {
+ cli_err ("%s %s on volume %s has been unsuccessful",
+ operation, heal_op_str, volname);
+ }
+
+ ret = rsp.op_ret;
+ goto out;
+ } else {
+ cli_out ("%s %s on volume %s has been successful %s", operation,
+ heal_op_str, volname, substr);
+ }
+
+ ret = rsp.op_ret;
+ if ((heal_op == GF_AFR_OP_HEAL_FULL) ||
+ (heal_op == GF_AFR_OP_HEAL_INDEX))
+ goto out;
+
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &dict);
+
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to allocate memory");
+ goto out;
+ } else {
+ dict->extra_stdfree = rsp.dict.dict_val;
+ }
+ ret = dict_get_int32 (dict, "count", &brick_count);
+ if (ret)
+ goto out;
+
+ if (!brick_count) {
+ cli_err ("All bricks of volume %s are down.", volname);
+ goto out;
+ }
+
+ switch (heal_op) {
+ case GF_AFR_OP_STATISTICS:
+ for (i = 0; i < brick_count; i++)
+ cmd_heal_volume_statistics_out (dict, i);
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ for (i = 0; i < brick_count; i++)
+ cmd_heal_volume_statistics_heal_count_out (dict,
+ i);
+ break;
+ case GF_AFR_OP_INDEX_SUMMARY:
+ case GF_AFR_OP_HEALED_FILES:
+ case GF_AFR_OP_HEAL_FAILED_FILES:
+ case GF_AFR_OP_SPLIT_BRAIN_FILES:
+ for (i = 0; i < brick_count; i++)
+ cmd_heal_volume_brick_out (dict, i);
+ break;
+ default:
+ break;
+ }
ret = rsp.op_ret;
out:
cli_cmd_broadcast_response (ret);
- if (local)
- cli_local_wipe (local);
- if (rsp.volname)
- free (rsp.volname);
- if (rsp.op_errstr)
- free (rsp.op_errstr);
+ free (rsp.op_errstr);
+ if (dict)
+ dict_unref (dict);
return ret;
}
int32_t
-gf_cli3_1_heal_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_heal_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_heal_vol_req *req = NULL;
+ gf_cli_req req = {{0,}};
int ret = 0;
- cli_local_t *local = NULL;
+ dict_t *dict = NULL;
if (!frame || !this || !data) {
ret = -1;
goto out;
}
- req = data;
- local = cli_local_get ();
-
- if (local) {
- local->u.heal_vol.volname = req->volname;
- frame->local = local;
- }
+ dict = data;
- ret = cli_cmd_submit (req, frame, cli_rpc_prog,
- GLUSTER_CLI_HEAL_VOLUME, NULL,
- this, gf_cli3_1_heal_volume_cbk,
- (xdrproc_t) xdr_gf1_cli_heal_vol_req);
+ ret = cli_to_glusterd (&req, frame, gf_cli_heal_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, dict,
+ GLUSTER_CLI_HEAL_VOLUME, this, cli_rpc_prog,
+ NULL);
out:
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ GF_FREE (req.dict.dict_val);
+
return ret;
}
int32_t
-gf_cli3_1_statedump_volume_cbk (struct rpc_req *req, struct iovec *iov,
+gf_cli_statedump_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_statedump_vol_rsp rsp = {0,};
+ gf_cli_rsp rsp = {0,};
int ret = -1;
+ char msg[1024] = {0,};
if (-1 == req->rpc_status)
goto out;
ret = xdr_to_generic (*iov, &rsp,
- (xdrproc_t)xdr_gf1_cli_statedump_vol_rsp);
+ (xdrproc_t)xdr_gf_cli_rsp);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "XDR decoding failed");
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
goto out;
}
- gf_log ("cli", GF_LOG_DEBUG, "Recieved response to statedump");
+ gf_log ("cli", GF_LOG_DEBUG, "Received response to statedump");
if (rsp.op_ret)
- cli_out ("%s", rsp.op_errstr);
+ snprintf (msg, sizeof(msg), "%s", rsp.op_errstr);
else
- cli_out ("Volume statedump sucessful");
+ snprintf (msg, sizeof (msg), "Volume statedump successful");
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_str ("volStatedump", msg, rsp.op_ret,
+ rsp.op_errno, rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret)
+ cli_err ("volume statedump: failed: %s", msg);
+ else
+ cli_out ("volume statedump: success");
ret = rsp.op_ret;
out:
@@ -4029,14 +7318,202 @@ out:
}
int32_t
-gf_cli3_1_statedump_volume (call_frame_t *frame, xlator_t *this,
+gf_cli_statedump_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_statedump_vol_req req = {0,};
+ gf_cli_req req = {{0,}};
dict_t *options = NULL;
+ int ret = -1;
+
+ if (!frame || !this || !data)
+ goto out;
+
+ options = data;
+
+ ret = cli_to_glusterd (&req, frame, gf_cli_statedump_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, options,
+ GLUSTER_CLI_STATEDUMP_VOLUME, this, cli_rpc_prog,
+ NULL);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
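+/* Callback for 'volume list': print each volume name from the response
+ * dictionary, or hand the dictionary to the XML writer in XML mode.
+ */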
+int32_t
+gf_cli_list_volume_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ gf_cli_rsp rsp = {0,};
+ dict_t *dict = NULL;
+ int vol_count = 0;
+ char *volname = NULL;
+ char key[1024] = {0,};
+ int i = 0;
+
+ if (-1 == req->rpc_status)
+ goto out;
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to allocate memory");
+ goto out;
+ }
+
+ if (global_state->mode & GLUSTER_MODE_XML) {
+ ret = cli_xml_output_vol_list (dict, rsp.op_ret, rsp.op_errno,
+ rsp.op_errstr);
+ if (ret)
+ gf_log ("cli", GF_LOG_ERROR,
+ "Error outputting to xml");
+ goto out;
+ }
+
+ if (rsp.op_ret)
+ cli_err ("%s", rsp.op_errstr);
+ else {
+ ret = dict_get_int32 (dict, "count", &vol_count);
+ if (ret)
+ goto out;
+
+ if (vol_count == 0) {
+ cli_err ("No volumes present in cluster");
+ goto out;
+ }
+ for (i = 0; i < vol_count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d", i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret)
+ goto out;
+ cli_out ("%s", volname);
+ }
+ }
+
+ ret = rsp.op_ret;
+
+out:
+ cli_cmd_broadcast_response (ret);
+ return ret;
+}
+
+int32_t
+gf_cli_list_volume (call_frame_t *frame, xlator_t *this, void *data)
+{
+ int ret = -1;
+ gf_cli_req req = {{0,}};
+
+ if (!frame || !this)
+ goto out;
+
+ ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
+ GLUSTER_CLI_LIST_VOLUME, NULL,
+ this, gf_cli_list_volume_cbk,
+ (xdrproc_t)xdr_gf_cli_req);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
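+/* Callback for 'volume clear-locks': on success print the lock summary
+ * ("lk-summary") returned by the bricks, otherwise the error message.
+ */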
+int32_t
+gf_cli_clearlocks_volume_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gf_cli_rsp rsp = {0,};
+ int ret = -1;
+ char *lk_summary = NULL;
char *volname = NULL;
- char *option_str = NULL;
- int option_cnt = 0;
+ dict_t *dict = NULL;
+
+ if (-1 == req->rpc_status)
+ goto out;
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (((call_frame_t *) myframe)->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+ gf_log ("cli", GF_LOG_DEBUG, "Received response to clear-locks");
+
+ if (rsp.op_ret) {
+ cli_err ("Volume clear-locks unsuccessful");
+ cli_err ("%s", rsp.op_errstr);
+
+ } else {
+ if (!rsp.dict.dict_len) {
+ cli_err ("Possibly no locks cleared");
+ ret = 0;
+ goto out;
+ }
+
+ dict = dict_new ();
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &dict);
+
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to serialize response dictionary");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get volname "
+ "from dictionary");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "lk-summary", &lk_summary);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get lock "
+ "summary from dictionary");
+ goto out;
+ }
+ cli_out ("Volume clear-locks successful");
+ cli_out ("%s", lk_summary);
+
+ }
+
+ ret = rsp.op_ret;
+
+out:
+ if (dict)
+ dict_unref (dict);
+ cli_cmd_broadcast_response (ret);
+ return ret;
+}
+
+int32_t
+gf_cli_clearlocks_volume (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gf_cli_req req = {{0,}};
+ dict_t *options = NULL;
int ret = -1;
if (!frame || !this || !data)
@@ -4044,75 +7521,1153 @@ gf_cli3_1_statedump_volume (call_frame_t *frame, xlator_t *this,
options = data;
- ret = dict_get_str (options, "volname", &volname);
- if (ret)
+ ret = cli_to_glusterd (&req, frame, gf_cli_clearlocks_volume_cbk,
+ (xdrproc_t) xdr_gf_cli_req, options,
+ GLUSTER_CLI_CLRLOCKS_VOLUME, this, cli_rpc_prog,
+ NULL);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
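+/* Report the outcome of 'snapshot delete': print the error string on
+ * failure, otherwise the name of the removed snap from the dictionary.
+ */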
+int32_t
+cli_snapshot_remove_reply (gf_cli_rsp *rsp, dict_t *dict, call_frame_t *frame)
+{
+ int32_t ret = -1;
+ char *snap_name = NULL;
+
+ GF_ASSERT (rsp);
+ GF_ASSERT (dict);
+ GF_ASSERT (frame);
+
+ if (rsp->op_ret) {
+ cli_err("snapshot delete: failed: %s",
+ rsp->op_errstr ? rsp->op_errstr :
+ "Please check log file for details");
+ ret = rsp->op_ret;
goto out;
- req.volname = volname;
+ }
- ret = dict_get_str (options, "options", &option_str);
- if (ret)
+ ret = dict_get_str (dict, "snapname", &snap_name);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get snapname");
+ goto out;
+ }
+
+ cli_out ("snapshot delete: %s: snap removed successfully",
+ snap_name);
+ ret = 0;
+
+out:
+ return ret;
+}
+
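+/* Display the result of 'snapshot config': confirmation of the hard/soft
+ * limits that were set, or, for a display request, the system-wide and
+ * per-volume snapshot limits.
+ */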
+int
+cli_snapshot_config_display (dict_t *dict, gf_cli_rsp *rsp)
+{
+ char buf[PATH_MAX] = "";
+ char *volname = NULL;
+ int ret = -1;
+ int config_command = 0;
+ uint64_t value = 0;
+ uint64_t hard_limit = 0;
+ uint64_t soft_limit = 0;
+ uint64_t i = 0;
+ uint64_t voldisplaycount = 0;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp);
+
+ if (rsp->op_ret) {
+ cli_err ("Snapshot Config : failed: %s",
+ rsp->op_errstr ? rsp->op_errstr :
+ "Please check log file for details");
+ ret = rsp->op_ret;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "config-command", &config_command);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch config type");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "volname", &volname);
+ /* Ignore the error, as volname is optional */
+
+ if (!volname) {
+ volname = "System";
+ }
+
+ ret = dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit);
+ /* Ignore the error, as the key specified is optional */
+ ret = dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit);
+
+ if (!hard_limit && !soft_limit
+ && config_command != GF_SNAP_CONFIG_DISPLAY) {
+ ret = -1;
+ gf_log(THIS->name, GF_LOG_ERROR,
+ "Could not fetch config-key");
+ goto out;
+ }
+
+ switch (config_command) {
+ case GF_SNAP_CONFIG_TYPE_SET:
+ if (hard_limit && soft_limit) {
+ cli_out ("snapshot config: snap-max-hard-limit "
+ "& snap-max-soft-limit for system set "
+ "successfully");
+ } else if (hard_limit){
+ cli_out ("snapshot config: %s "
+ "for snap-max-hard-limit set successfully",
+ volname);
+ } else if (soft_limit) {
+ cli_out ("snapshot config: %s "
+ "for snap-max-soft-limit set successfully",
+ volname);
+ }
+ break;
+
+ case GF_SNAP_CONFIG_DISPLAY :
+ cli_out ("\nSnapshot System Configuration:");
+ ret = dict_get_uint64 (dict, "snap-max-hard-limit",
+ &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "snap_max_hard_limit for %s", volname);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("snap-max-hard-limit : %"PRIu64, value);
+
+ ret = dict_get_uint64 (dict, "snap-max-soft-limit",
+ &soft_limit);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "snap-max-soft-limit for %s", volname);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("snap-max-soft-limit : %"PRIu64"%%\n",
+ soft_limit);
+
+ cli_out ("Snapshot Volume Configuration:");
+
+ ret = dict_get_uint64 (dict, "voldisplaycount",
+ &voldisplaycount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Could not fetch voldisplaycount");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < voldisplaycount; i++) {
+ snprintf (buf, sizeof(buf), "volume%ld-volname", i);
+ ret = dict_get_str (dict, buf, &volname);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ " %s", buf);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("\nVolume : %s", volname);
+
+ snprintf (buf, sizeof(buf),
+ "volume%ld-snap-max-hard-limit", i);
+ ret = dict_get_uint64 (dict, buf, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ " %s", buf);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("snap-max-hard-limit : %"PRIu64, value);
+
+ snprintf (buf, sizeof(buf),
+ "volume%ld-active-hard-limit", i);
+ ret = dict_get_uint64 (dict, buf, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch"
+ " effective snap_max_hard_limit for "
+ "%s", volname);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("Effective snap-max-hard-limit : %"PRIu64,
+ value);
+
+ snprintf (buf, sizeof(buf),
+ "volume%ld-snap-max-soft-limit", i);
+ ret = dict_get_uint64 (dict, buf, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ " %s", buf);
+ ret = -1;
+ goto out;
+ }
+ cli_out ("Effective snap-max-soft-limit : %"PRIu64" "
+ "(%"PRIu64"%%)", value, soft_limit);
+ }
+ break;
+ default :
+ break;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/* This function is used to print the volume related information
+ * of a snap.
+ *
+ * arg - 0, dict : Response Dictionary.
+ * arg - 1, prefix str : snaplist.snap{0..}.vol{0..}.*
+ */
+int
+cli_get_each_volinfo_in_snap (dict_t *dict, char *keyprefix,
+ gf_boolean_t snap_driven) {
+ char key[PATH_MAX] = "";
+ char *get_buffer = NULL;
+ int value = 0;
+ int ret = -1;
+ char indent[5] = "\t";
+ char *volname = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (keyprefix);
+
+ if (snap_driven) {
+ ret = snprintf (key, sizeof (key), "%s.volname", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent,
+ "Snap Volume Name", ":", get_buffer);
+
+ ret = snprintf (key, sizeof (key),
+ "%s.origin-volname", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &volname);
+ if (ret) {
+ gf_log ("cli", GF_LOG_WARNING, "Failed to get %s", key);
+ cli_out ("%-12s", "Origin:");
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent,
+ "Origin Volume name", ":", volname);
+
+
+ ret = snprintf (key, sizeof (key), "%s.snapcount",
+ keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s%s %s %s %d", indent, "Snaps taken for",
+ volname, ":", value);
+
+ ret = snprintf (key, sizeof (key), "%s.snaps-available",
+ keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &value);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s%s %s %s %d", indent, "Snaps available for",
+ volname, ":", value);
+ }
+
+
+ ret = snprintf (key, sizeof (key), "%s.vol-status", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get %s", key);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Status",
+ ":", get_buffer);
+out :
+ return ret;
+}
+
+/* This function is used to print snap related information
+ * arg - 0, dict : Response dictionary.
+ * arg - 1, prefix_str : snaplist.snap{0..}.*
+ */
+int
+cli_get_volinfo_in_snap (dict_t *dict, char *keyprefix) {
+
+ char key[PATH_MAX] = "";
+ int i = 0;
+ int volcount = 0;
+ int ret = -1;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (keyprefix);
+
+ ret = snprintf (key, sizeof (key), "%s.vol-count", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &volcount);
+ for (i = 1 ; i <= volcount ; i++) {
+ ret = snprintf (key, sizeof (key),
+ "%s.vol%d", keyprefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_each_volinfo_in_snap (dict, key, _gf_true);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not list "
+ "details of volume in a snap");
+ goto out;
+ }
+ cli_out (" ");
+ }
+
+out :
+ return ret;
+}
+
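+/* Print the details of one snapshot (name, UUID, optional description and
+ * creation time); in snap-driven mode the constituent snap volumes are
+ * listed as well.
+ */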
+int
+cli_get_each_snap_info (dict_t *dict, char *prefix_str,
+ gf_boolean_t snap_driven) {
+ char key_buffer[PATH_MAX] = "";
+ char *get_buffer = NULL;
+ int ret = -1;
+ char indent[5] = "";
+
+ GF_ASSERT (dict);
+ GF_ASSERT (prefix_str);
+
+ if (!snap_driven)
+ strcat (indent, "\t");
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snapname",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to fetch snapname %s ",
+ key_buffer);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Snapshot",
+ ":", get_buffer);
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snap-id",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to fetch snap-id %s ",
+ key_buffer);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Snap UUID",
+ ":", get_buffer);
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snap-desc",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (!ret) {
+ /* Ignore error for description */
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent,
+ "Description", ":", get_buffer);
+ }
+
+ ret = snprintf (key_buffer, sizeof (key_buffer), "%s.snap-time",
+ prefix_str);
+ if (ret < 0 ) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key_buffer, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to fetch snap-time %s ",
+ prefix_str);
+ goto out;
+ }
+ cli_out ("%s" INDENT_MAIN_HEAD "%s", indent, "Created",
+ ":", get_buffer);
+
+ if (snap_driven) {
+ cli_out ("%-12s", "Snap Volumes:\n");
+ ret = cli_get_volinfo_in_snap (dict, prefix_str);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to list details "
+ "of the snaps");
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
+/* This is a generic function to print snap related information.
+ * arg - 0, dict : Response Dictionary
+ */
+int
+cli_call_snapshot_info (dict_t *dict, gf_boolean_t bool_snap_driven) {
+ int snap_count = 0;
+ char key[PATH_MAX] = "";
+ int ret = -1;
+ int i = 0;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_int32 (dict, "snap-count", &snap_count);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get snap-count");
+ goto out;
+ }
+
+ if (snap_count == 0) {
+ cli_out ("No snapshots present");
+ }
+
+ for (i = 1 ; i <= snap_count ; i++) {
+ ret = snprintf (key, sizeof (key), "snap%d", i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_each_snap_info (dict, key, bool_snap_driven);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to print snap details");
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
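+/* Print the snapshot information of a volume: the origin volume name, the
+ * number of snaps taken and still available, and the details of each snap.
+ */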
+int
+cli_get_snaps_in_volume (dict_t *dict) {
+ int ret = -1;
+ int i = 0;
+ int count = 0;
+ int avail = 0;
+ char key[PATH_MAX] = "";
+ char *get_buffer = NULL;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_str (dict, "origin-volname", &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch origin-volname");
+ goto out;
+ }
+ cli_out (INDENT_MAIN_HEAD "%s", "Volume Name", ":", get_buffer);
+
+ ret = dict_get_int32 (dict, "snap-count", &avail);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch snap-count");
+ goto out;
+ }
+ cli_out (INDENT_MAIN_HEAD "%d", "Snaps Taken", ":", avail);
+
+ ret = dict_get_int32 (dict, "snaps-available", &count);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch snaps-available");
+ goto out;
+ }
+ cli_out (INDENT_MAIN_HEAD "%d", "Snaps Available", ":", count);
+
+ for (i = 1 ; i <= avail ; i++) {
+ snprintf (key, sizeof (key), "snap%d", i);
+ ret = cli_get_each_snap_info (dict, key, _gf_false);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to print snap details");
+ goto out;
+ }
+
+ ret = snprintf (key, sizeof (key), "snap%d.vol1", i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_each_volinfo_in_snap (dict, key, _gf_false);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not get volume "
+ "related information");
+ goto out;
+ }
+
+ cli_out (" ");
+ }
+out :
+ return ret;
+}
+
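+/* Print the snapshot names returned for 'snapshot list'. */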
+int
+cli_snapshot_list (dict_t *dict) {
+ int snapcount = 0;
+ char key[PATH_MAX] = "";
+ int ret = -1;
+ int i = 0;
+ char *get_buffer = NULL;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_int32 (dict, "snap-count", &snapcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch snap count");
goto out;
- req.options = option_str;
+ }
+
+ if (snapcount == 0) {
+ cli_out ("No snapshots present");
+ }
+
+ for (i = 1 ; i <= snapcount ; i++) {
+ ret = snprintf (key, sizeof (key), "snapname%d",i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not get %s ", key);
+ goto out;
+ } else {
+ cli_out ("%s", get_buffer);
+ }
+ }
+out :
+ return ret;
+}
+
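+/* Print the status of every brick of a snap volume: brick path, volume
+ * group, whether the brick is running, its PID, data usage and LV size.
+ */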
+int
+cli_get_snap_volume_status (dict_t *dict, char *key_prefix)
+{
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ char *buffer = NULL;
+ int brickcount = 0;
+ int i = 0;
+ int pid = 0;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (key_prefix);
+
+ ret = snprintf (key, sizeof (key), "%s.brickcount", key_prefix);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = dict_get_int32 (dict, key, &brickcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to fetch brickcount");
+ goto out;
+ }
+
+ for ( i = 0 ; i < brickcount ; i++ ) {
+ ret = snprintf (key, sizeof (key), "%s.brick%d.path",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Brick Path");
+ continue;
+ }
+ cli_out ("\n\t%-17s %s %s", "Brick Path", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.vgname",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Volume Group");
+ cli_out ("\t%-17s %s %s", "Volume Group", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "Volume Group", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.status",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Brick Running");
+ cli_out ("\t%-17s %s %s", "Brick Running", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "Brick Running", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.pid",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &pid);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get pid");
+ cli_out ("\t%-17s %s %s", "Brick PID", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %d", "Brick PID", ":", pid);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.data",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO,
+ "Unable to get Data Percent");
+ cli_out ("\t%-17s %s %s", "Data Percentage", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "Data Percentage", ":", buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.lvsize",
+ key_prefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = dict_get_str (dict, key, &buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_INFO, "Unable to get LV Size");
+ cli_out ("\t%-17s %s %s", "LV Size", ":", "N/A");
+ } else
+ cli_out ("\t%-17s %s %s", "LV Size", ":", buffer);
+
+ }
+out :
+ return ret;
+}
+
+
+
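+/* Print the status of a single snapshot: snap name, UUID and the brick
+ * status of each of its snap volumes.
+ */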
+int
+cli_get_single_snap_status (dict_t *dict, char *keyprefix)
+{
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ int i = 0;
+ int volcount = 0;
+ char *get_buffer = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (keyprefix);
+
+ ret = snprintf (key, sizeof (key), "%s.snapname", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get snapname");
+ goto out;
+ }
+ cli_out ("\nSnap Name : %s", get_buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.uuid", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_str (dict, key, &get_buffer);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get snap UUID");
+ goto out;
+ }
+ cli_out ("Snap UUID : %s", get_buffer);
+
+ ret = snprintf (key, sizeof (key), "%s.volcount", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, key, &volcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Unable to get volume count");
+ goto out;
+ }
+
+ for (i = 0 ; i < volcount ; i++) {
+ ret = snprintf (key, sizeof (key), "%s.vol%d", keyprefix, i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = cli_get_snap_volume_status (dict, key);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Could not get snap volume status");
+ goto out;
+ }
+ }
+out :
+ return ret;
+}
+
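+/* Print the status of every snapshot present in the response dictionary. */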
+int
+cli_snap_status_all (dict_t *dict) {
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ int snapcount = 0;
+ int i = 0;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_int32 (dict, "status.snapcount", &snapcount);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not get snapcount");
+ goto out;
+ }
+
+ if (snapcount == 0) {
+ cli_out ("No snapshots present");
+ }
+
+ for (i = 0 ; i < snapcount; i++) {
+ ret = snprintf (key, sizeof (key), "status.snap%d",i);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_single_snap_status (dict, key);
+ }
+out:
+ return ret;
+}
+
+
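+/* Dispatch 'snapshot status' output depending on whether all snaps, a
+ * single snap, or the snaps of one volume were requested.
+ */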
+int
+cli_snapshot_status_display (dict_t *dict, gf_cli_rsp *rsp)
+{
+ char key[PATH_MAX] = "";
+ int ret = -1;
+ int status_cmd = -1;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp);
+
+ if (rsp->op_ret) {
+ cli_err ("Snapshot Status : failed: %s",
+ rsp->op_errstr ? rsp->op_errstr :
+ "Please check log file for details");
+ ret = rsp->op_ret;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "cmd", &status_cmd);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch status type");
+ goto out;
+ }
+ switch (status_cmd) {
+ case GF_SNAP_STATUS_TYPE_ALL :
+ {
+ ret = cli_snap_status_all (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "status of all snap");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_STATUS_TYPE_SNAP :
+ {
+ ret = snprintf (key, sizeof (key), "status.snap0");
+ if (ret < 0) {
+ goto out;
+ }
+ ret = cli_get_single_snap_status (dict, key);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "status of snap");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_STATUS_TYPE_VOL :
+ {
+ ret = cli_snap_status_all (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Could not fetch "
+ "status of snap in a volume");
+ goto out;
+ }
+ break;
+ }
+ default :
+ break;
+ }
+out :
+ return ret;
+}
+
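+/* Common callback for the snapshot sub-commands: unserialize the response
+ * dictionary and dispatch on its "type" key to the matching display
+ * routine.
+ */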
+int
+gf_cli_snapshot_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ gf_cli_rsp rsp = {0, };
+ dict_t *dict = NULL;
+ char *snap_name = NULL;
+ int32_t type = 0;
+ call_frame_t *frame = NULL;
+ gf_boolean_t snap_driven = _gf_false;
+
+ if (req->rpc_status == -1) {
+ ret = -1;
+ goto out;
+ }
+
+ frame = myframe;
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_cli_rsp);
+ if (ret < 0) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "Failed to decode xdr response");
+ goto out;
+ }
+
+ dict = dict_new ();
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (rsp.dict.dict_val, rsp.dict.dict_len, &dict);
- ret = dict_get_int32 (options, "option-cnt", &option_cnt);
if (ret)
goto out;
- req.option_cnt = option_cnt;
- ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
- GLUSTER_CLI_STATEDUMP_VOLUME, NULL,
- this, gf_cli3_1_statedump_volume_cbk,
- (xdrproc_t)xdr_gf1_cli_statedump_vol_req);
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "failed to get type");
+ goto out;
+ }
+ switch (type) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ if (rsp.op_ret) {
+ cli_err("snapshot create: failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snap_name);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to get snap name");
+ goto out;
+ }
+ cli_out ("snapshot create: %s: snap created successfully",
+ snap_name);
+ break;
+
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ /* TODO: Check if rsp.op_ret needs to be checked here. Or is
+ * it ok to check this in the start of the function where we
+ * get rsp.*/
+ if (rsp.op_ret) {
+ cli_err("snapshot restore: failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snap_name);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to get snap name");
+ goto out;
+ }
+
+ cli_out ("Snapshot restore: %s: Snap restored "
+ "successfully", snap_name);
+
+ ret = 0;
+ break;
+
+ case GF_SNAP_OPTION_TYPE_INFO:
+ if (rsp.op_ret) {
+ cli_err ("Snapshot info : failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ snap_driven = dict_get_str_boolean (dict, "snap-driven",
+ _gf_false);
+ if (snap_driven == _gf_true) {
+ ret = cli_call_snapshot_info (dict, snap_driven);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Snapshot info failed");
+ goto out;
+ }
+ } else if (snap_driven == _gf_false) {
+ ret = cli_get_snaps_in_volume (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Snapshot info failed");
+ goto out;
+ }
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_CONFIG:
+ ret = cli_snapshot_config_display (dict, &rsp);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to display "
+ "snapshot config output.");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_LIST:
+ if (rsp.op_ret) {
+ cli_err ("Snapshot list : failed: %s",
+ rsp.op_errstr ? rsp.op_errstr :
+ "Please check log file for details");
+ ret = rsp.op_ret;
+ goto out;
+ }
+
+ ret = cli_snapshot_list (dict);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to display "
+ "snapshot list");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ ret = cli_snapshot_remove_reply (&rsp, dict, frame);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Failed to delete snap");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_STATUS:
+ ret = cli_snapshot_status_display (dict, &rsp);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to display "
+ "snapshot status output.");
+ goto out;
+ }
+ break;
+
+ default:
+ cli_err ("Unknown command executed");
+ ret = -1;
+ goto out;
+ }
+out:
+ if (dict)
+ dict_unref (dict);
+ cli_cmd_broadcast_response (ret);
+
+ free (rsp.dict.dict_val);
+ free (rsp.op_errstr);
+
+ return ret;
+}
+
+int32_t
+gf_cli_snapshot (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gf_cli_req req = {{0,}};
+ dict_t *options = NULL;
+ int ret = -1;
+
+ if (!frame || !this || !data)
+ goto out;
+
+ options = data;
+
+ ret = cli_to_glusterd (&req, frame, gf_cli_snapshot_cbk,
+ (xdrproc_t) xdr_gf_cli_req, options,
+ GLUSTER_CLI_SNAP, this, cli_rpc_prog,
+ NULL);
out:
- if (options)
- dict_destroy (options);
gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
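+/* Common helper for submitting a CLI request to glusterd: rebuild the
+ * original command line from local->words into "cmd-str", serialize the
+ * dictionary into the request and submit it with the given callback.
+ */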
+int
+cli_to_glusterd (gf_cli_req *req, call_frame_t *frame,
+ fop_cbk_fn_t cbkfn, xdrproc_t xdrproc, dict_t *dict,
+ int procnum, xlator_t *this, rpc_clnt_prog_t *prog,
+ struct iobref *iobref)
+{
+ int ret = 0;
+ size_t len = 0;
+ char *cmd = NULL;
+ int i = 0;
+ const char **words = NULL;
+ cli_local_t *local = NULL;
+
+ if (!this || !frame || !dict) {
+ ret = -1;
+ goto out;
+ }
+
+ if (!frame->local) {
+ ret = -1;
+ goto out;
+ }
+
+ local = frame->local;
+
+ if (!local->words) {
+ ret = -1;
+ goto out;
+ }
+
+ words = local->words;
+
+ while (words[i])
+ len += strlen (words[i++]) + 1;
+
+ cmd = GF_CALLOC (1, len, gf_common_mt_char);
+
+ if (!cmd) {
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; words[i]; i++) {
+ strncat (cmd, words[i], strlen (words[i]));
+ if (words[i+1] != NULL)
+ strncat (cmd, " ", strlen (" "));
+ }
+
+ cmd [len - 1] = '\0';
+
+ ret = dict_set_dynstr (dict, "cmd-str", cmd);
+ if (ret)
+ goto out;
+
+ ret = dict_allocate_and_serialize (dict, &(req->dict).dict_val,
+ &(req->dict).dict_len);
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get serialized length of dict");
+ goto out;
+ }
+
+ ret = cli_cmd_submit (req, frame, prog, procnum, iobref, this,
+ cbkfn, (xdrproc_t) xdrproc);
+
+out:
return ret;
+
}
struct rpc_clnt_procedure gluster_cli_actors[GLUSTER_CLI_MAXVALUE] = {
[GLUSTER_CLI_NULL] = {"NULL", NULL },
- [GLUSTER_CLI_PROBE] = {"PROBE_QUERY", gf_cli3_1_probe},
- [GLUSTER_CLI_DEPROBE] = {"DEPROBE_QUERY", gf_cli3_1_deprobe},
- [GLUSTER_CLI_LIST_FRIENDS] = {"LIST_FRIENDS", gf_cli3_1_list_friends},
- [GLUSTER_CLI_CREATE_VOLUME] = {"CREATE_VOLUME", gf_cli3_1_create_volume},
- [GLUSTER_CLI_DELETE_VOLUME] = {"DELETE_VOLUME", gf_cli3_1_delete_volume},
- [GLUSTER_CLI_START_VOLUME] = {"START_VOLUME", gf_cli3_1_start_volume},
- [GLUSTER_CLI_STOP_VOLUME] = {"STOP_VOLUME", gf_cli3_1_stop_volume},
- [GLUSTER_CLI_RENAME_VOLUME] = {"RENAME_VOLUME", gf_cli3_1_rename_volume},
- [GLUSTER_CLI_DEFRAG_VOLUME] = {"DEFRAG_VOLUME", gf_cli3_1_defrag_volume},
- [GLUSTER_CLI_GET_VOLUME] = {"GET_VOLUME", gf_cli3_1_get_volume},
- [GLUSTER_CLI_GET_NEXT_VOLUME] = {"GET_NEXT_VOLUME", gf_cli3_1_get_next_volume},
- [GLUSTER_CLI_SET_VOLUME] = {"SET_VOLUME", gf_cli3_1_set_volume},
- [GLUSTER_CLI_ADD_BRICK] = {"ADD_BRICK", gf_cli3_1_add_brick},
- [GLUSTER_CLI_REMOVE_BRICK] = {"REMOVE_BRICK", gf_cli3_1_remove_brick},
- [GLUSTER_CLI_REPLACE_BRICK] = {"REPLACE_BRICK", gf_cli3_1_replace_brick},
- [GLUSTER_CLI_LOG_FILENAME] = {"LOG FILENAME", gf_cli3_1_log_filename},
- [GLUSTER_CLI_LOG_LOCATE] = {"LOG LOCATE", gf_cli3_1_log_locate},
- [GLUSTER_CLI_LOG_ROTATE] = {"LOG ROTATE", gf_cli3_1_log_rotate},
- [GLUSTER_CLI_GETSPEC] = {"GETSPEC", gf_cli3_1_getspec},
- [GLUSTER_CLI_PMAP_PORTBYBRICK] = {"PMAP PORTBYBRICK", gf_cli3_1_pmap_b2p},
- [GLUSTER_CLI_SYNC_VOLUME] = {"SYNC_VOLUME", gf_cli3_1_sync_volume},
- [GLUSTER_CLI_RESET_VOLUME] = {"RESET_VOLUME", gf_cli3_1_reset_volume},
- [GLUSTER_CLI_FSM_LOG] = {"FSM_LOG", gf_cli3_1_fsm_log},
- [GLUSTER_CLI_GSYNC_SET] = {"GSYNC_SET", gf_cli3_1_gsync_set},
- [GLUSTER_CLI_PROFILE_VOLUME] = {"PROFILE_VOLUME", gf_cli3_1_profile_volume},
- [GLUSTER_CLI_QUOTA] = {"QUOTA", gf_cli3_1_quota},
- [GLUSTER_CLI_TOP_VOLUME] = {"TOP_VOLUME", gf_cli3_1_top_volume},
- [GLUSTER_CLI_LOG_LEVEL] = {"VOLUME_LOGLEVEL", gf_cli3_1_log_level},
- [GLUSTER_CLI_GETWD] = {"GETWD", gf_cli3_1_getwd},
- [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", gf_cli3_1_status_volume},
- [GLUSTER_CLI_MOUNT] = {"MOUNT", gf_cli3_1_mount},
- [GLUSTER_CLI_UMOUNT] = {"UMOUNT", gf_cli3_1_umount},
- [GLUSTER_CLI_HEAL_VOLUME] = {"HEAL_VOLUME", gf_cli3_1_heal_volume},
- [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", gf_cli3_1_statedump_volume},
+ [GLUSTER_CLI_PROBE] = {"PROBE_QUERY", gf_cli_probe},
+ [GLUSTER_CLI_DEPROBE] = {"DEPROBE_QUERY", gf_cli_deprobe},
+ [GLUSTER_CLI_LIST_FRIENDS] = {"LIST_FRIENDS", gf_cli_list_friends},
+ [GLUSTER_CLI_UUID_RESET] = {"UUID_RESET", gf_cli3_1_uuid_reset},
+ [GLUSTER_CLI_UUID_GET] = {"UUID_GET", gf_cli3_1_uuid_get},
+ [GLUSTER_CLI_CREATE_VOLUME] = {"CREATE_VOLUME", gf_cli_create_volume},
+ [GLUSTER_CLI_DELETE_VOLUME] = {"DELETE_VOLUME", gf_cli_delete_volume},
+ [GLUSTER_CLI_START_VOLUME] = {"START_VOLUME", gf_cli_start_volume},
+ [GLUSTER_CLI_STOP_VOLUME] = {"STOP_VOLUME", gf_cli_stop_volume},
+ [GLUSTER_CLI_RENAME_VOLUME] = {"RENAME_VOLUME", gf_cli_rename_volume},
+ [GLUSTER_CLI_DEFRAG_VOLUME] = {"DEFRAG_VOLUME", gf_cli_defrag_volume},
+ [GLUSTER_CLI_GET_VOLUME] = {"GET_VOLUME", gf_cli_get_volume},
+ [GLUSTER_CLI_GET_NEXT_VOLUME] = {"GET_NEXT_VOLUME", gf_cli_get_next_volume},
+ [GLUSTER_CLI_SET_VOLUME] = {"SET_VOLUME", gf_cli_set_volume},
+ [GLUSTER_CLI_ADD_BRICK] = {"ADD_BRICK", gf_cli_add_brick},
+ [GLUSTER_CLI_REMOVE_BRICK] = {"REMOVE_BRICK", gf_cli_remove_brick},
+ [GLUSTER_CLI_REPLACE_BRICK] = {"REPLACE_BRICK", gf_cli_replace_brick},
+ [GLUSTER_CLI_LOG_ROTATE] = {"LOG ROTATE", gf_cli_log_rotate},
+ [GLUSTER_CLI_GETSPEC] = {"GETSPEC", gf_cli_getspec},
+ [GLUSTER_CLI_PMAP_PORTBYBRICK] = {"PMAP PORTBYBRICK", gf_cli_pmap_b2p},
+ [GLUSTER_CLI_SYNC_VOLUME] = {"SYNC_VOLUME", gf_cli_sync_volume},
+ [GLUSTER_CLI_RESET_VOLUME] = {"RESET_VOLUME", gf_cli_reset_volume},
+ [GLUSTER_CLI_FSM_LOG] = {"FSM_LOG", gf_cli_fsm_log},
+ [GLUSTER_CLI_GSYNC_SET] = {"GSYNC_SET", gf_cli_gsync_set},
+ [GLUSTER_CLI_PROFILE_VOLUME] = {"PROFILE_VOLUME", gf_cli_profile_volume},
+ [GLUSTER_CLI_QUOTA] = {"QUOTA", gf_cli_quota},
+ [GLUSTER_CLI_TOP_VOLUME] = {"TOP_VOLUME", gf_cli_top_volume},
+ [GLUSTER_CLI_GETWD] = {"GETWD", gf_cli_getwd},
+ [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", gf_cli_status_volume},
+ [GLUSTER_CLI_STATUS_ALL] = {"STATUS_ALL", gf_cli_status_volume_all},
+ [GLUSTER_CLI_MOUNT] = {"MOUNT", gf_cli_mount},
+ [GLUSTER_CLI_UMOUNT] = {"UMOUNT", gf_cli_umount},
+ [GLUSTER_CLI_HEAL_VOLUME] = {"HEAL_VOLUME", gf_cli_heal_volume},
+ [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", gf_cli_statedump_volume},
+ [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", gf_cli_list_volume},
+ [GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", gf_cli_clearlocks_volume},
+ [GLUSTER_CLI_COPY_FILE] = {"COPY_FILE", gf_cli_copy_file},
+ [GLUSTER_CLI_SYS_EXEC] = {"SYS_EXEC", gf_cli_sys_exec},
+ [GLUSTER_CLI_SNAP] = {"SNAP", gf_cli_snapshot},
};
struct rpc_clnt_program cli_prog = {
.progname = "Gluster CLI",
.prognum = GLUSTER_CLI_PROGRAM,
.progver = GLUSTER_CLI_VERSION,
- .numproc = GLUSTER_CLI_PROCCNT,
+ .numproc = GLUSTER_CLI_MAXVALUE,
.proctable = gluster_cli_actors,
};
diff --git a/cli/src/cli-xml-output.c b/cli/src/cli-xml-output.c
new file mode 100644
index 000000000..d8884d44b
--- /dev/null
+++ b/cli/src/cli-xml-output.c
@@ -0,0 +1,3772 @@
+/*
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <stdlib.h>
+#include "cli.h"
+#include "cli1-xdr.h"
+#include "run.h"
+#include "compat.h"
+#include "syscall.h"
+
+
+enum gf_task_types {
+ GF_TASK_TYPE_REBALANCE,
+ GF_TASK_TYPE_REMOVE_BRICK
+};
+
+/*
+ * IMPORTANT NOTE:
+ * All exported functions in this file which use libxml need to use a
+ * #if (HAVE_LIB_XML), #else, #endif
+ * For eg,
+ * int exported_func () {
+ * #if (HAVE_LIB_XML)
+ * <Stuff using libxml>
+ * #else
+ * return 0;
+ * #endif
+ * }
+ *
+ * All other functions, which are called internally within this file, need to be
+ * within #if (HAVE_LIB_XML), #endif statements
+ * For eg,
+ * #if (HAVE_LIB_XML)
+ * int internal_func ()
+ * {
+ * }
+ * #endif
+ *
+ * Following the above format ensures that all xml related code is compiled
+ * only when libxml2 is present, and also keeps the rest of the codebase free
+ * of #if (HAVE_LIB_XML)
+ */
+
+
+#if (HAVE_LIB_XML)
+
+#include <libxml/encoding.h>
+#include <libxml/xmlwriter.h>
+
+#define XML_RET_CHECK_AND_GOTO(ret, label) do { \
+ if (ret < 0) { \
+ ret = -1; \
+ goto label; \
+ } \
+ else \
+ ret = 0; \
+ } while (0)
+
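+/* Note: the libxml2 xmlTextWriter* calls used throughout this file return
+ * the number of bytes written on success and -1 on error; the macro above
+ * collapses any non-negative result to 0 and jumps to the given label on
+ * failure, so callers only ever see 0 or -1. */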
+int
+cli_begin_xml_output (xmlTextWriterPtr *writer, xmlDocPtr *doc)
+{
+ int ret = -1;
+
+ *writer = xmlNewTextWriterDoc (doc, 0);
+ if (*writer == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = xmlTextWriterStartDocument (*writer, "1.0", "UTF-8", "yes");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <cliOutput> */
+ ret = xmlTextWriterStartElement (*writer, (xmlChar *)"cliOutput");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_end_xml_output (xmlTextWriterPtr writer, xmlDocPtr doc)
+{
+ int ret = -1;
+
+ /* </cliOutput> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterEndDocument (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ /* Dump xml document to stdout and pretty format it */
+ xmlSaveFormatFileEnc ("-", doc, "UTF-8", 1);
+
+ xmlFreeTextWriter (writer);
+ xmlFreeDoc (doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_common (xmlTextWriterPtr writer, int op_ret, int op_errno,
+ char *op_errstr)
+{
+ int ret = -1;
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"opRet",
+ "%d", op_ret);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"opErrno",
+ "%d", op_errno);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"opErrstr",
+ "%s", op_errstr);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_str (char *op, char *str, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ if (op) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"cliOp",
+ "%s", op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ if (str) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"output",
+ "%s", str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
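+/* Illustrative sketch only (hypothetical argument values): a call such as
+ * cli_xml_output_str ("volStart", "Starting volume has been successful", 0, 0, "")
+ * would print a document roughly of the form
+ * <cliOutput>
+ * <opRet>0</opRet>
+ * <opErrno>0</opErrno>
+ * <opErrstr/>
+ * <cliOp>volStart</cliOp>
+ * <output>Starting volume has been successful</output>
+ * </cliOutput>
+ * pretty-printed to stdout by cli_end_xml_output (). */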
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_data_pair (dict_t *this, char *key, data_t *value,
+ void *data)
+{
+ int ret = -1;
+ xmlTextWriterPtr *writer = NULL;
+
+ writer = (xmlTextWriterPtr *)data;
+
+ ret = xmlTextWriterWriteFormatElement (*writer, (xmlChar *)key,
+ "%s", value->data);
+
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_dict (char *op, dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <"op"> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (dict)
+ dict_foreach (dict, cli_xml_output_data_pair, &writer);
+
+ /* </"op"> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
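+/* Illustrative sketch (hypothetical op and dict contents): with op = "volRename"
+ * and a dict holding "volname" -> "test", cli_xml_output_dict () would nest
+ * <volRename><volname>test</volname></volRename>
+ * inside the common <cliOutput> wrapper, emitting one child element per
+ * dict member via cli_xml_output_data_pair (). */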
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_vol_status_common (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index, int *online,
+ gf_boolean_t *node_present)
+{
+ int ret = -1;
+ char *hostname = NULL;
+ char *path = NULL;
+ int port = 0;
+ int status = 0;
+ int pid = 0;
+ char key[1024] = {0,};
+
+ snprintf (key, sizeof (key), "brick%d.hostname", brick_index);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret) {
+ *node_present = _gf_false;
+ goto out;
+ }
+ *node_present = _gf_true;
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"hostname",
+ "%s", hostname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", brick_index);
+ ret = dict_get_str (dict, key, &path);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"path",
+ "%s", path);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.status", brick_index);
+ ret = dict_get_int32 (dict, key, &status);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"status",
+ "%d", status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ *online = status;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.port", brick_index);
+ ret = dict_get_int32 (dict, key, &port);
+ if (ret)
+ goto out;
+
+ /* If the process is offline or doesn't provide a port (e.g. shd, the
+ * self-heal daemon), report the port as "N/A"; otherwise print the
+ * port number of the process.
+ */
+
+ if (*online == 1 && port != 0)
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"port",
+ "%d", port);
+ else
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"port",
+ "%s", "N/A");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.pid", brick_index);
+ ret = dict_get_int32 (dict, key, &pid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"pid",
+ "%d", pid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
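+/* Illustrative fragment (hypothetical values) of what this helper emits for
+ * a single brick, inside the <node> element opened by its caller:
+ * <hostname>host1</hostname> <path>/bricks/b1</path> <status>1</status>
+ * <port>49152</port> <pid>1234</pid> */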
+
+int
+cli_xml_output_vol_status_detail (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index)
+{
+ int ret = -1;
+ uint64_t size_total = 0;
+ uint64_t size_free = 0;
+ char *device = NULL;
+ uint64_t block_size = 0;
+ char *mnt_options = NULL;
+ char *fs_name = NULL;
+ char *inode_size = NULL;
+ uint64_t inodes_total = 0;
+ uint64_t inodes_free = 0;
+ char key[1024] = {0,};
+
+ snprintf (key, sizeof (key), "brick%d.total", brick_index);
+ ret = dict_get_uint64 (dict, key, &size_total);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"sizeTotal",
+ "%"PRIu64, size_total);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.free", brick_index);
+ ret = dict_get_uint64 (dict, key, &size_free);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"sizeFree",
+ "%"PRIu64, size_free);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.device", brick_index);
+ ret = dict_get_str (dict, key, &device);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"device",
+ "%s", device);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.block_size", brick_index);
+ ret = dict_get_uint64 (dict, key, &block_size);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"blockSize",
+ "%"PRIu64, block_size);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mnt_options", brick_index);
+ ret = dict_get_str (dict, key, &mnt_options);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"mntOptions",
+ "%s", mnt_options);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.fs_name", brick_index);
+ ret = dict_get_str (dict, key, &fs_name);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"fsName",
+ "%s", fs_name);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* inode details are only available for ext 2/3/4 & xfs */
+ if (!IS_EXT_FS(fs_name) && strcmp (fs_name, "xfs")) {
+ ret = 0;
+ goto out;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.inode_size", brick_index);
+ ret = dict_get_str (dict, key, &inode_size);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"inodeSize",
+ "%s", inode_size);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.total_inodes", brick_index);
+ ret = dict_get_uint64 (dict, key, &inodes_total);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"inodesTotal",
+ "%"PRIu64, inodes_total);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.free_inodes", brick_index);
+ ret = dict_get_uint64 (dict, key, &inodes_free);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"inodesFree",
+ "%"PRIu64, inodes_free);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_mempool (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ int mempool_count = 0;
+ char *name = NULL;
+ int hotcount = 0;
+ int coldcount = 0;
+ uint64_t paddedsizeof = 0;
+ uint64_t alloccount = 0;
+ int maxalloc = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <mempool> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"mempool");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.mempool-count", prefix);
+ ret = dict_get_int32 (dict, key, &mempool_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"count",
+ "%d", mempool_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < mempool_count; i++) {
+ /* <pool> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"pool");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.name", prefix, i);
+ ret = dict_get_str (dict, key, &name);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"name",
+ "%s", name);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.hotcount", prefix, i);
+ ret = dict_get_int32 (dict, key, &hotcount);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"hotCount",
+ "%d", hotcount);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.coldcount", prefix, i);
+ ret = dict_get_int32 (dict, key, &coldcount);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"coldCount",
+ "%d", coldcount);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.paddedsizeof",
+ prefix, i);
+ ret = dict_get_uint64 (dict, key, &paddedsizeof);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"paddedSizeOf", "%"PRIu64,
+ paddedsizeof);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.alloccount", prefix, i);
+ ret = dict_get_uint64 (dict, key, &alloccount);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"allocCount",
+ "%"PRIu64, alloccount);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.max_alloc", prefix, i);
+ ret = dict_get_int32 (dict, key, &maxalloc);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"maxAlloc",
+ "%d", maxalloc);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.pool-misses", prefix, i);
+ ret = dict_get_uint64 (dict, key, &alloccount);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"poolMisses",
+ "%"PRIu64, alloccount);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pool%d.max-stdalloc", prefix, i);
+ ret = dict_get_int32 (dict, key, &maxalloc);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"maxStdAlloc",
+ "%d", maxalloc);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ /* </pool> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </mempool> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_mem (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index)
+{
+ int ret = -1;
+ int arena = 0;
+ int ordblks = 0;
+ int smblks = 0;
+ int hblks = 0;
+ int hblkhd = 0;
+ int usmblks = 0;
+ int fsmblks = 0;
+ int uordblks = 0;
+ int fordblks = 0;
+ int keepcost = 0;
+ char key[1024] = {0,};
+
+ /* <memStatus> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"memStatus");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <mallinfo> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"mallinfo");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "brick%d.mallinfo.arena", brick_index);
+ ret = dict_get_int32 (dict, key, &arena);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"arena",
+ "%d", arena);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.ordblks", brick_index);
+ ret = dict_get_int32 (dict, key, &ordblks);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"ordblks",
+ "%d", ordblks);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.smblks", brick_index);
+ ret = dict_get_int32 (dict, key, &smblks);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"smblks",
+ "%d", smblks);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.hblks", brick_index);
+ ret = dict_get_int32 (dict, key, &hblks);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"hblks",
+ "%d", hblks);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.hblkhd", brick_index);
+ ret = dict_get_int32 (dict, key, &hblkhd);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"hblkhd",
+ "%d", hblkhd);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.usmblks", brick_index);
+ ret = dict_get_int32 (dict, key, &usmblks);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"usmblks",
+ "%d", usmblks);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.fsmblks", brick_index);
+ ret = dict_get_int32 (dict, key, &fsmblks);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"fsmblks",
+ "%d", fsmblks);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.uordblks", brick_index);
+ ret = dict_get_int32 (dict, key, &uordblks);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"uordblks",
+ "%d", uordblks);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.fordblks", brick_index);
+ ret = dict_get_int32 (dict, key, &fordblks);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"fordblks",
+ "%d", fordblks);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.mallinfo.keepcost", brick_index);
+ ret = dict_get_int32 (dict, key, &keepcost);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"keepcost",
+ "%d", keepcost);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </mallinfo> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d", brick_index);
+ ret = cli_xml_output_vol_status_mempool (writer, dict, key);
+ if (ret)
+ goto out;
+
+ /* </memStatus> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_clients (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index)
+{
+ int ret = -1;
+ int client_count = 0;
+ char *hostname = NULL;
+ uint64_t bytes_read = 0;
+ uint64_t bytes_write = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <clientsStatus> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"clientsStatus");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "brick%d.clientcount", brick_index);
+ ret = dict_get_int32 (dict, key, &client_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"clientCount",
+ "%d", client_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < client_count; i++) {
+ /* <client> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"client");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.client%d.hostname",
+ brick_index, i);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"hostname",
+ "%s", hostname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.client%d.bytesread",
+ brick_index, i);
+ ret = dict_get_uint64 (dict, key, &bytes_read);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"bytesRead",
+ "%"PRIu64, bytes_read);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.client%d.byteswrite",
+ brick_index, i);
+ ret = dict_get_uint64 (dict, key, &bytes_write);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"bytesWrite",
+ "%"PRIu64, bytes_write);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </client> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </clientsStatus> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_inode_entry (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ char *gfid = NULL;
+ uint64_t nlookup = 0;
+ uint32_t ref = 0;
+ int ia_type = 0;
+ char key[1024] = {0,};
+
+ /* <inode> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"inode");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.gfid", prefix);
+ ret = dict_get_str (dict, key, &gfid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"gfid",
+ "%s", gfid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.nlookup", prefix);
+ ret = dict_get_uint64 (dict, key, &nlookup);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"nLookup",
+ "%"PRIu64, nlookup);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.ref", prefix);
+ ret = dict_get_uint32 (dict, key, &ref);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"ref",
+ "%"PRIu32, ref);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.ia_type", prefix);
+ ret = dict_get_int32 (dict, key, &ia_type);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"iaType",
+ "%d", ia_type);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </inode> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_itable (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ uint32_t active_size = 0;
+ uint32_t lru_size = 0;
+ uint32_t purge_size = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ snprintf (key, sizeof (key), "%s.active_size", prefix);
+ ret = dict_get_uint32 (dict, key, &active_size);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"activeSize",
+ "%"PRIu32, active_size);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ if (active_size != 0) {
+ /* <active> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"active");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < active_size; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.active%d", prefix, i);
+ ret = cli_xml_output_vol_status_inode_entry
+ (writer, dict, key);
+ if (ret)
+ goto out;
+ }
+ /* </active> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.lru_size", prefix);
+ ret = dict_get_uint32 (dict, key, &lru_size);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"lruSize",
+ "%"PRIu32, lru_size);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ if (lru_size != 0) {
+ /* <lru> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"lru");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < lru_size; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.lru%d", prefix, i);
+ ret = cli_xml_output_vol_status_inode_entry
+ (writer, dict, key);
+ if (ret)
+ goto out;
+ }
+ /* </lru> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.purge_size", prefix);
+ ret = dict_get_uint32 (dict, key, &purge_size);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"purgeSize",
+ "%"PRIu32, purge_size);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ if (purge_size != 0) {
+ /* <purge> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"purge");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < purge_size; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.purge%d", prefix, i);
+ ret = cli_xml_output_vol_status_inode_entry
+ (writer, dict, key);
+ if (ret)
+ goto out;
+ }
+ /* </purge> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_inode (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index)
+{
+ int ret = -1;
+ int conn_count = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <inodeStatus> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"inodeStatus");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "brick%d.conncount", brick_index);
+ ret = dict_get_int32 (dict, key, &conn_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"connections",
+ "%d", conn_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < conn_count; i++) {
+ /* <connection> */
+ ret = xmlTextWriterStartElement (writer,
+ (xmlChar *)"connection");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.conn%d.itable",
+ brick_index, i);
+ ret = cli_xml_output_vol_status_itable (writer, dict, key);
+ if (ret)
+ goto out;
+
+ /* </connection> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </inodeStatus> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_fdtable (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ int refcount = 0;
+ uint32_t maxfds = 0;
+ int firstfree = 0;
+ int openfds = 0;
+ int fd_pid = 0;
+ int fd_refcount = 0;
+ int fd_flags = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <fdTable> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"fdTable");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.refcount", prefix);
+ ret = dict_get_int32 (dict, key, &refcount);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"refCount",
+ "%d", refcount);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.maxfds", prefix);
+ ret = dict_get_uint32 (dict, key, &maxfds);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"maxFds",
+ "%"PRIu32, maxfds);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.firstfree", prefix);
+ ret = dict_get_int32 (dict, key, &firstfree);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"firstFree",
+ "%d", firstfree);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.openfds", prefix);
+ ret = dict_get_int32 (dict, key, &openfds);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"openFds",
+ "%d", openfds);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < maxfds; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.fdentry%d.pid", prefix, i);
+ ret = dict_get_int32 (dict, key, &fd_pid);
+ if (ret)
+ continue;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.fdentry%d.refcount",
+ prefix, i);
+ ret = dict_get_int32 (dict, key, &fd_refcount);
+ if (ret)
+ continue;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.fdentry%d.flags", prefix, i);
+ ret = dict_get_int32 (dict, key, &fd_flags);
+ if (ret)
+ continue;
+
+ /* <fd> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"fd");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"entry",
+ "%d", i+1);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"pid",
+ "%d", fd_pid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"refCount",
+ "%d", fd_refcount);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"flags",
+ "%d", fd_flags);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </fd> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </fdTable> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_fd (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index)
+{
+ int ret = -1;
+ int conn_count = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <fdStatus> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"fdStatus");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "brick%d.conncount", brick_index);
+ ret = dict_get_int32 (dict, key, &conn_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"connections",
+ "%d", conn_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < conn_count; i++) {
+ /* <connection> */
+ ret = xmlTextWriterStartElement (writer,
+ (xmlChar *)"connection");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.conn%d.fdtable",
+ brick_index, i);
+ ret = cli_xml_output_vol_status_fdtable (writer, dict, key);
+ if (ret)
+ goto out;
+
+ /* </connection> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </fdStatus> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_callframe (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ int ref_count = 0;
+ char *translator = NULL;
+ int complete = 0;
+ char *parent = NULL;
+ char *wind_from = NULL;
+ char *wind_to = NULL;
+ char *unwind_from = NULL;
+ char *unwind_to = NULL;
+ char key[1024] = {0,};
+
+ /* <callFrame> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"callFrame");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.refcount", prefix);
+ ret = dict_get_int32 (dict, key, &ref_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"refCount",
+ "%d", ref_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.translator", prefix);
+ ret = dict_get_str (dict, key, &translator);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"translator",
+ "%s", translator);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.complete", prefix);
+ ret = dict_get_int32 (dict, key, &complete);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"complete",
+ "%d", complete);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.parent", prefix);
+ ret = dict_get_str (dict, key, &parent);
+ if (!ret) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"parent",
+ "%s", parent);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.windfrom", prefix);
+ ret = dict_get_str (dict, key, &wind_from);
+ if (!ret) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"windFrom",
+ "%s", wind_from);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.windto", prefix);
+ ret = dict_get_str (dict, key, &wind_to);
+ if (!ret) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"windTo",
+ "%s", wind_to);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.unwindfrom", prefix);
+ ret = dict_get_str (dict, key, &unwind_from);
+ if (!ret) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"unwindFrom",
+ "%s", unwind_from);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.unwindto", prefix);
+ ret = dict_get_str (dict, key, &unwind_to);
+ if (!ret) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"unwindTo",
+ "%s", unwind_to);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </callFrame> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_callstack (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ int uid = 0;
+ int gid = 0;
+ int pid = 0;
+ uint64_t unique = 0;
+ int frame_count = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <callStack> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"callStack");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.uid", prefix);
+ ret = dict_get_int32 (dict, key, &uid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"uid",
+ "%d", uid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.gid", prefix);
+ ret = dict_get_int32 (dict, key, &gid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"gid",
+ "%d", gid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.pid", prefix);
+ ret = dict_get_int32 (dict, key, &pid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"pid",
+ "%d", pid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.unique", prefix);
+ ret = dict_get_uint64 (dict, key, &unique);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"unique",
+ "%"PRIu64, unique);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.count", prefix);
+ ret = dict_get_int32 (dict, key, &frame_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"frameCount",
+ "%d", frame_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < frame_count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.frame%d", prefix, i);
+ ret = cli_xml_output_vol_status_callframe (writer, dict,
+ key);
+ if (ret)
+ goto out;
+ }
+
+ /* </callStack> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_callpool (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index)
+{
+ int ret = -1;
+ int call_count = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <callpoolStatus> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"callpoolStatus");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "brick%d.callpool.count", brick_index);
+ ret = dict_get_int32 (dict, key, &call_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"count",
+ "%d", call_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < call_count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.callpool.stack%d",
+ brick_index, i);
+ ret = cli_xml_output_vol_status_callstack (writer, dict,
+ key);
+ if (ret)
+ goto out;
+ }
+
+ /* </callpoolStatus> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_status_begin (cli_local_t *local, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+
+ ret = cli_begin_xml_output (&(local->writer), &(local->doc));
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_xml_output_common (local->writer, op_ret, op_errno,
+ op_errstr);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <volStatus> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *) "volStatus");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <volumes> */
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"volumes");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_status_end (cli_local_t *local)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+
+ /* </volumes> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </volStatus> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (local->writer, local->doc);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
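+/* Sketch of the intended pairing (the actual callers are not part of this
+ * file): cli_xml_output_vol_status_begin () once, then
+ * cli_xml_output_vol_status () per volume, then
+ * cli_xml_output_vol_status_end (), yielding
+ * <volStatus><volumes><volume>...</volume>...</volumes></volStatus>
+ * inside the common <cliOutput> wrapper. */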
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_remove_brick_task_params (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ char key[1024] = {0,};
+ int count = 0;
+ int i = 0;
+ char *brick = NULL;
+
+ /* <params> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"params");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.count", prefix);
+ ret = dict_get_int32 (dict, key, &count);
+ if (ret)
+ goto out;
+
+ for (i = 1; i <= count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.brick%d", prefix, i);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"brick",
+ "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ brick = NULL;
+ }
+
+ /* </params> */
+ ret = xmlTextWriterEndElement (writer);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_replace_brick_task_params (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+
+ int ret = -1;
+ char key[1024] = {0,};
+ char *brick = NULL;
+
+ /* <params> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"params");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%s.src-brick", prefix);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"srcBrick",
+ "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.dst-brick", prefix);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"dstBrick",
+ "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ /* </params> */
+ ret = xmlTextWriterEndElement (writer);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_tasks (cli_local_t *local, dict_t *dict)
+{
+ int ret = -1;
+ char *task_type = NULL;
+ char *task_id_str = NULL;
+ int status = 0;
+ int tasks = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ /* <tasks> */
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"tasks");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "tasks", &tasks);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < tasks; i++) {
+ /* <task> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"task");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.type", i);
+ ret = dict_get_str (dict, key, &task_type);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"type",
+ "%s", task_type);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.id", i);
+ ret = dict_get_str (dict, key, &task_id_str);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.status", i);
+ ret = dict_get_int32 (dict, key, &status);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"status",
+ "%d", status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (!strcmp (task_type, "Replace brick")) {
+ if (status) {
+ status = GF_DEFRAG_STATUS_COMPLETE;
+ } else {
+ status = GF_DEFRAG_STATUS_STARTED;
+ }
+ }
+
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"statusStr",
+ "%s",
+ cli_vol_task_status_str[status]);
+
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d", i);
+ if (!strcmp (task_type, "Replace brick")) {
+ ret = cli_xml_output_replace_brick_task_params
+ (local->writer, dict, key);
+ if (ret)
+ goto out;
+ } else if (!strcmp (task_type, "Remove brick")) {
+ ret = cli_xml_output_remove_brick_task_params
+ (local->writer, dict, key);
+ if (ret)
+ goto out;
+ }
+
+
+ /* </task> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </tasks> */
+ ret = xmlTextWriterEndElement (local->writer);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_status_tasks_detail (cli_local_t *local, dict_t *dict)
+{
+ int ret = -1;
+ char *volname = NULL;
+
+ /*<volume>*/
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"volume");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"volName", "%s",
+ volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_xml_output_vol_status_tasks (local, dict);
+ if (ret)
+ goto out;
+
+ /* </volume> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_status (cli_local_t *local, dict_t *dict)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ char *volname = NULL;
+ int brick_count = 0;
+ int brick_index_max = -1;
+ int other_count = 0;
+ int index_max = 0;
+ uint32_t cmd = GF_CLI_STATUS_NONE;
+ int online = 0;
+ gf_boolean_t node_present = _gf_true;
+ int i;
+
+
+ /* <volume> */
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"volume");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"volName", "%s",
+ volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "count", &brick_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"nodeCount", "%d",
+ brick_count);
+ if (ret)
+ goto out;
+
+ ret = dict_get_uint32 (dict, "cmd", &cmd);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (dict, "other-count", &other_count);
+ if (ret)
+ goto out;
+
+ index_max = brick_index_max + other_count;
+
+ for (i = 0; i <= index_max; i++) {
+ /* <node> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"node");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_xml_output_vol_status_common (local->writer, dict, i,
+ &online, &node_present);
+ if (ret) {
+ if (node_present)
+ goto out;
+ else
+ continue;
+ }
+
+ switch (cmd & GF_CLI_STATUS_MASK) {
+ case GF_CLI_STATUS_DETAIL:
+ ret = cli_xml_output_vol_status_detail (local->writer,
+ dict, i);
+ if (ret)
+ goto out;
+ break;
+
+ case GF_CLI_STATUS_MEM:
+ if (online) {
+ ret = cli_xml_output_vol_status_mem
+ (local->writer, dict, i);
+ if (ret)
+ goto out;
+ }
+ break;
+
+ case GF_CLI_STATUS_CLIENTS:
+ if (online) {
+ ret = cli_xml_output_vol_status_clients
+ (local->writer, dict, i);
+ if (ret)
+ goto out;
+ }
+ break;
+
+ case GF_CLI_STATUS_INODE:
+ if (online) {
+ ret = cli_xml_output_vol_status_inode
+ (local->writer, dict, i);
+ if (ret)
+ goto out;
+ }
+ break;
+
+ case GF_CLI_STATUS_FD:
+ if (online) {
+ ret = cli_xml_output_vol_status_fd
+ (local->writer, dict, i);
+ if (ret)
+ goto out;
+ }
+ break;
+
+ case GF_CLI_STATUS_CALLPOOL:
+ if (online) {
+ ret = cli_xml_output_vol_status_callpool
+ (local->writer, dict, i);
+ if (ret)
+ goto out;
+ }
+ break;
+ default:
+ break;
+
+ }
+ /* </node> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* Tasks are only present when a normal volume status call is done on a
+ * single volume or on all volumes
+ */
+ if (((cmd & GF_CLI_STATUS_MASK) == GF_CLI_STATUS_NONE) &&
+ (cmd & (GF_CLI_STATUS_VOL|GF_CLI_STATUS_ALL))) {
+ ret = cli_xml_output_vol_status_tasks (local, dict);
+ if (ret)
+ goto out;
+ }
+
+ /* </volume> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_vol_top_rw_perf (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index, int member_index)
+{
+ int ret = -1;
+ char *filename = NULL;
+ uint64_t throughput = 0;
+ long int time_sec = 0;
+ long int time_usec = 0;
+ char timestr[256] = {0,};
+ char key[1024] = {0,};
+
+ /* <file> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"file");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%d-filename-%d", brick_index,
+ member_index);
+ ret = dict_get_str (dict, key, &filename);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"filename",
+ "%s", filename);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-value-%d", brick_index, member_index);
+ ret = dict_get_uint64 (dict, key, &throughput);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"count",
+ "%"PRIu64, throughput);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-time-sec-%d", brick_index,
+ member_index);
+ ret = dict_get_int32 (dict, key, (int32_t *)&time_sec);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-time-usec-%d", brick_index,
+ member_index);
+ ret = dict_get_int32 (dict, key, (int32_t *)&time_usec);
+ if (ret)
+ goto out;
+
+ gf_time_fmt (timestr, sizeof timestr, time_sec, gf_timefmt_FT);
+ snprintf (timestr + strlen (timestr),
+ sizeof timestr - strlen (timestr),
+ ".%"GF_PRI_SUSECONDS, time_usec);
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"time",
+ "%s", timestr);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </file> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+cli_xml_output_vol_top_other (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index, int member_index)
+{
+ int ret = -1;
+ char *filename = NULL;
+ uint64_t count = 0;
+ char key[1024] = {0,};
+
+ /* <file> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"file");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%d-filename-%d", brick_index,
+ member_index);
+ ret = dict_get_str (dict, key, &filename);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"filename",
+ "%s", filename);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-value-%d", brick_index, member_index);
+ ret = dict_get_uint64 (dict, key, &count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"count",
+ "%"PRIu64, count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </file> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_top (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ int brick_count = 0;
+ int top_op = GF_CLI_TOP_NONE;
+ char *brick_name = NULL;
+ int members = 0;
+ uint64_t current_open = 0;
+ uint64_t max_open = 0;
+ char *max_open_time = NULL;
+ double throughput = 0.0;
+ double time_taken = 0.0;
+ char key[1024] = {0,};
+ int i = 0;
+ int j = 0;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volTop> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volTop");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "count", &brick_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"brickCount",
+ "%d", brick_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "1-top-op", &top_op);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"topOp",
+ "%d", top_op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ while (i < brick_count) {
+ i++;
+
+ /* <brick> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"brick");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-brick", i);
+ ret = dict_get_str (dict, key, &brick_name);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"name",
+ "%s", brick_name);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-members", i);
+ ret = dict_get_int32 (dict, key, &members);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"members",
+ "%d", members);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ switch (top_op) {
+ case GF_CLI_TOP_OPEN:
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-current-open", i);
+ ret = dict_get_uint64 (dict, key, &current_open);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"currentOpen", "%"PRIu64,
+ current_open);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-max-open", i);
+ ret = dict_get_uint64 (dict, key, &max_open);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"maxOpen", "%"PRIu64,
+ max_open);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-max-openfd-time", i);
+ ret = dict_get_str (dict, key, &max_open_time);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"maxOpenTime", "%s",
+ max_open_time);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
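+ /* no break: GF_CLI_TOP_OPEN falls through to the arm below, which
+ * adds no further per-brick fields for read/write/opendir/readdir */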
+ case GF_CLI_TOP_READ:
+ case GF_CLI_TOP_WRITE:
+ case GF_CLI_TOP_OPENDIR:
+ case GF_CLI_TOP_READDIR:
+
+ break;
+
+ case GF_CLI_TOP_READ_PERF:
+ case GF_CLI_TOP_WRITE_PERF:
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-throughput", i);
+ ret = dict_get_double (dict, key, &throughput);
+ if (!ret) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-time", i);
+ ret = dict_get_double (dict, key, &time_taken);
+ }
+
+ if (!ret) {
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"throughput",
+ "%f", throughput);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"timeTaken",
+ "%f", time_taken);
+ }
+
+ break;
+
+ default:
+ ret = -1;
+ goto out;
+ }
+
+ for (j = 1; j <= members; j++) {
+ if (top_op == GF_CLI_TOP_READ_PERF ||
+ top_op == GF_CLI_TOP_WRITE_PERF) {
+ ret = cli_xml_output_vol_top_rw_perf
+ (writer, dict, i, j);
+ } else {
+ ret = cli_xml_output_vol_top_other
+ (writer, dict, i, j);
+ }
+ if (ret)
+ goto out;
+ }
+
+
+ /* </brick> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </volTop> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_vol_profile_stats (xmlTextWriterPtr writer, dict_t *dict,
+ int brick_index, int interval)
+{
+ int ret = -1;
+ uint64_t read_count = 0;
+ uint64_t write_count = 0;
+ uint64_t hits = 0;
+ double avg_latency = 0.0;
+ double max_latency = 0.0;
+ double min_latency = 0.0;
+ uint64_t duration = 0;
+ uint64_t total_read = 0;
+ uint64_t total_write = 0;
+ char key[1024] = {0};
+ int i = 0;
+
+ /* <cumulativeStats> || <intervalStats> */
+ if (interval == -1)
+ ret = xmlTextWriterStartElement (writer,
+ (xmlChar *)"cumulativeStats");
+ else
+ ret = xmlTextWriterStartElement (writer,
+ (xmlChar *)"intervalStats");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <blockStats> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"blockStats");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < 32; i++) {
+ /* <block> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"block");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"size", "%"PRIu32, (1 << i));
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-read-%d", brick_index,
+ interval, (1 << i));
+ ret = dict_get_uint64 (dict, key, &read_count);
+ if (ret)
+ read_count = 0;
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"reads", "%"PRIu64, read_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-write-%d", brick_index,
+ interval, (1 << i));
+ ret = dict_get_uint64 (dict, key, &write_count);
+ if (ret)
+ write_count = 0;
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"writes", "%"PRIu64, write_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </block> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </blockStats> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <fopStats> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"fopStats");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-%d-hits", brick_index,
+ interval, i);
+ ret = dict_get_uint64 (dict, key, &hits);
+ if (ret)
+ goto cont;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-%d-avglatency", brick_index,
+ interval, i);
+ ret = dict_get_double (dict, key, &avg_latency);
+ if (ret)
+ goto cont;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-%d-minlatency", brick_index,
+ interval, i);
+ ret = dict_get_double (dict, key, &min_latency);
+ if (ret)
+ goto cont;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-%d-maxlatency", brick_index,
+ interval, i);
+ ret = dict_get_double (dict, key, &max_latency);
+ if (ret)
+ goto cont;
+
+ /* <fop> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"fop");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"name","%s", gf_fop_list[i]);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"hits", "%"PRIu64, hits);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"avgLatency", "%f", avg_latency);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"minLatency", "%f", min_latency);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"maxLatency", "%f", max_latency);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </fop> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+cont:
+ hits = 0;
+ avg_latency = 0.0;
+ min_latency = 0.0;
+ max_latency = 0.0;
+ }
+
+ /* </fopStats> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-duration", brick_index, interval);
+ ret = dict_get_uint64 (dict, key, &duration);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"duration",
+ "%"PRIu64, duration);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-total-read", brick_index, interval);
+ ret = dict_get_uint64 (dict, key, &total_read);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"totalRead",
+ "%"PRIu64, total_read);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-%d-total-write", brick_index, interval);
+ ret = dict_get_uint64 (dict, key, &total_write);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"totalWrite",
+ "%"PRIu64, total_write);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </cumulativeStats> || </intervalStats> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_profile (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ char *volname = NULL;
+ int op = GF_CLI_STATS_NONE;
+ int brick_count = 0;
+ char *brick_name = NULL;
+ int interval = 0;
+ char key[1024] = {0,};
+ int i = 0;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volProfile> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volProfile");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"volname",
+ "%s", volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "op", &op);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"profileOp",
+ "%d", op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (op != GF_CLI_STATS_INFO)
+ goto cont;
+
+ ret = dict_get_int32 (dict, "count", &brick_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"brickCount",
+ "%d", brick_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ while (i < brick_count) {
+ i++;
+
+ /* <brick> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"brick");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%d-brick", i);
+ ret = dict_get_str (dict, key, &brick_name);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"brickName", "%s", brick_name);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ snprintf (key, sizeof (key), "%d-cumulative", i);
+ ret = dict_get_int32 (dict, key, &interval);
+ if (ret == 0) {
+ ret = cli_xml_output_vol_profile_stats
+ (writer, dict, i, interval);
+ if (ret)
+ goto out;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%d-interval", i);
+ ret = dict_get_int32 (dict, key, &interval);
+ if (ret == 0) {
+ ret = cli_xml_output_vol_profile_stats
+ (writer, dict, i, interval);
+ if (ret)
+ goto out;
+ }
+
+ /* </brick> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+cont:
+ /* </volProfile> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_list (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ int count = 0;
+ char *volname = NULL;
+ char key[1024] = {0,};
+ int i = 0;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volList> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volList");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "count", &count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"count",
+ "%d", count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (i = 0; i < count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d", i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"volume",
+ "%s", volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </volList> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_vol_info_option (xmlTextWriterPtr writer, char *substr,
+ char *optstr, char *valstr)
+{
+ int ret = -1;
+ char *ptr1 = NULL;
+ char *ptr2 = NULL;
+
+ ptr1 = substr;
+ ptr2 = optstr;
+
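+ /* Advance ptr1/ptr2 in lock-step while substr and optstr match;
+ * ptr2 ends up at the part of optstr following the common prefix,
+ * i.e. the bare option name. If optstr ends right there, there is
+ * no name to print and the entry is skipped. */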
+ while (ptr1) {
+ if (*ptr1 != *ptr2)
+ break;
+ ptr1++;
+ ptr2++;
+ if (!ptr1)
+ goto out;
+ if (!ptr2)
+ goto out;
+ }
+ if (*ptr2 == '\0')
+ goto out;
+
+ /* <option> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"option");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"name",
+ "%s", ptr2);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"value",
+ "%s", valstr);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </option> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+struct tmp_xml_option_logger {
+ char *key;
+ xmlTextWriterPtr writer;
+};
+
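+/* dict_foreach callback: for each key containing "option.", writes an
+ * <option> element using the key prefix and writer carried in the
+ * tmp_xml_option_logger passed as data. */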
+static int
+_output_vol_info_option (dict_t *d, char *k, data_t *v,
+ void *data)
+{
+ int ret = 0;
+ char *ptr = NULL;
+ struct tmp_xml_option_logger *tmp = NULL;
+
+ tmp = data;
+
+ ptr = strstr (k, "option.");
+ if (!ptr)
+ goto out;
+
+ if (!v) {
+ ret = -1;
+ goto out;
+ }
+ ret = cli_xml_output_vol_info_option (tmp->writer, tmp->key, k,
+ v->data);
+
+out:
+ return ret;
+}
+
+int
+cli_xml_output_vol_info_options (xmlTextWriterPtr writer, dict_t *dict,
+ char *prefix)
+{
+ int ret = -1;
+ int opt_count = 0;
+ char key[1024] = {0,};
+ struct tmp_xml_option_logger tmp = {0,};
+
+ snprintf (key, sizeof (key), "%s.opt_count", prefix);
+ ret = dict_get_int32 (dict, key, &opt_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"optCount",
+ "%d", opt_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <options> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"options");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ snprintf (key, sizeof (key), "%s.option.", prefix);
+
+ tmp.key = key;
+ tmp.writer = writer;
+ ret = dict_foreach (dict, _output_vol_info_option, &tmp);
+ if (ret)
+ goto out;
+
+ /* </options> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_info (cli_local_t *local, dict_t *dict)
+{
+#if (HAVE_LIB_XML)
+ int ret = 0;
+ int count = 0;
+ char *volname = NULL;
+ char *volume_id = NULL;
+ char *uuid = NULL;
+ int type = 0;
+ int status = 0;
+ int brick_count = 0;
+ int dist_count = 0;
+ int stripe_count = 0;
+ int replica_count = 0;
+ int transport = 0;
+ char *brick = NULL;
+ char key[1024] = {0,};
+ int i = 0;
+ int j = 1;
+ char *caps = NULL;
+ int k __attribute__((unused)) = 0;
+ char *snap_volume = NULL;
+
+ ret = dict_get_int32 (dict, "count", &count);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ /* <volume> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"volume");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.name", i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"name",
+ "%s", volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.volume_id", i);
+ ret = dict_get_str (dict, key, &volume_id);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"id",
+ "%s", volume_id);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.status", i);
+ ret = dict_get_int32 (dict, key, &status);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"status",
+ "%d", status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.snap_volume", i);
+ ret = dict_get_str (dict, key, &snap_volume);
+ if (ret)
+ goto out;
+ if (snap_volume) {
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"snapVol",
+ "%s", snap_volume);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ ret = xmlTextWriterWriteFormatElement
+ (local->writer, (xmlChar *)"statusStr", "%s",
+ cli_vol_status_str[status]);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.brick_count", i);
+ ret = dict_get_int32 (dict, key, &brick_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"brickCount",
+ "%d", brick_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.dist_count", i);
+ ret = dict_get_int32 (dict, key, &dist_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"distCount",
+ "%d", dist_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.stripe_count", i);
+ ret = dict_get_int32 (dict, key, &stripe_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"stripeCount",
+ "%d", stripe_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.replica_count", i);
+ ret = dict_get_int32 (dict, key, &replica_count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"replicaCount",
+ "%d", replica_count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.type", i);
+ ret = dict_get_int32 (dict, key, &type);
+ if (ret)
+ goto out;
+ /* For Distributed-(stripe,replicate,stripe-replicate) types */
+ if ((type > 0) && (dist_count < brick_count))
+ type += 3;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"type",
+ "%d", type);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"typeStr",
+ "%s",
+ cli_vol_type_str[type]);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.transport", i);
+ ret = dict_get_int32 (dict, key, &transport);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"transport",
+ "%d", transport);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+#ifdef HAVE_BD_XLATOR
+ /* <xlators> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"xlators");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (k = 0; ; k++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),"volume%d.xlator%d", i, k);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ break;
+
+ /* <xlator> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"xlator");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement
+ (local->writer, (xmlChar *)"name", "%s", caps);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <capabilities> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)
+ "capabilities");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ for (j = 0; ; j++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "volume%d.xlator%d.caps%d", i, k, j);
+ ret = dict_get_str (dict, key, &caps);
+ if (ret)
+ break;
+ ret = xmlTextWriterWriteFormatElement
+ (local->writer, (xmlChar *)"capability",
+ "%s", caps);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+ /* </capabilities> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ /* </xlator> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+ ret = xmlTextWriterFullEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ /* </xlators> */
+#else
+ caps = 0; /* Avoid compiler warnings when BD not enabled */
+#endif
+ j = 1;
+
+ /* <bricks> */
+ ret = xmlTextWriterStartElement (local->writer,
+ (xmlChar *)"bricks");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ while (j <= brick_count) {
+ ret = xmlTextWriterStartElement
+ (local->writer, (xmlChar *)"brick");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.brick%d.uuid",
+ i, j);
+ ret = dict_get_str (dict, key, &uuid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatAttribute
+ (local->writer, (xmlChar *)"uuid", "%s",
+ uuid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.brick%d", i, j);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatString
+ (local->writer, "%s", brick);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </brick> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ j++;
+ }
+ /* </bricks> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d", i);
+ ret = cli_xml_output_vol_info_options (local->writer, dict,
+ key);
+ if (ret)
+ goto out;
+
+ /* </volume> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ if (volname) {
+ GF_FREE (local->get_vol.volname);
+ local->get_vol.volname = gf_strdup (volname);
+ local->vol_count += count;
+ }
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_info_begin (cli_local_t *local, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+
+ GF_ASSERT (local);
+
+ ret = cli_begin_xml_output (&(local->writer), &(local->doc));
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (local->writer, op_ret, op_errno,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volInfo> */
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"volInfo");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <volumes> */
+ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"volumes");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* Init vol count */
+ local->vol_count = 0;
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_info_end (cli_local_t *local)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+
+ GF_ASSERT (local);
+
+ ret = xmlTextWriterWriteFormatElement (local->writer,
+ (xmlChar *)"count",
+ "%d", local->vol_count);
+
+ /* </volumes> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </volInfo> */
+ ret = xmlTextWriterEndElement (local->writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (local->writer, local->doc);
+
+out:
+ gf_log ("cli", GF_LOG_ERROR, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_quota_limit_list (char *volname, char *limit_list,
+ int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ int64_t size = 0;
+ int64_t limit_value = 0;
+ int i = 0;
+ int j = 0;
+ int k = 0;
+ int len = 0;
+ char *size_str = NULL;
+ char path[PATH_MAX] = {0,};
+ char ret_str[1024] = {0,};
+ char value[1024] = {0,};
+ char mountdir[] = "/tmp/mountXXXXXX";
+ char abspath[PATH_MAX] = {0,};
+ runner_t runner = {0,};
+
+ GF_ASSERT (volname);
+ GF_ASSERT (limit_list);
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volQuota> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volQuota");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (!limit_list)
+ goto cont;
+
+ len = strlen (limit_list);
+ if (len == 0)
+ goto cont;
+
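+ /* Mount the volume at a temporary directory so that the
+ * trusted.limit.list xattr can be queried for each path below. */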
+ if (mkdtemp (mountdir) == NULL) {
+ gf_log ("cli", GF_LOG_ERROR, "failed to create a temporary"
+ " mount directory");
+ ret = -1;
+ goto out;
+ }
+
+ ret = runcmd (SBIN_DIR"/glusterfs", "-s", "localhost",
+ "--volfile-id", volname, "-l",
+ DEFAULT_LOG_FILE_DIRECTORY"/quota-list-xml.log",
+ mountdir, NULL);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "failed to mount glusterfs client");
+ ret = -1;
+ goto rm_dir;
+ }
+
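+ /* limit_list is a comma-separated list of "<path>:<limit>" pairs;
+ * each iteration parses one pair and emits a <quota> element. */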
+ while (i < len) {
+ j = 0;
+ k = 0;
+ size = 0;
+
+ while (limit_list[i] != ':')
+ path[k++] = limit_list[i++];
+ path[k] = '\0';
+
+ i++;
+
+ while (limit_list[i] != ',' && limit_list[i] != '\0')
+ value[j++] = limit_list[i++];
+ i++;
+
+ snprintf (abspath, sizeof (abspath), "%s/%s", mountdir, path);
+ ret = sys_lgetxattr (abspath, "trusted.limit.list",
+ (void *)ret_str, sizeof (ret_str));
+ if (ret >= 0) {
+ sscanf (ret_str, "%"SCNd64",%"SCNd64, &size,
+ &limit_value);
+ size_str = gf_uint64_2human_readable ((uint64_t)size);
+ }
+
+ /* <quota> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"quota");
+ XML_RET_CHECK_AND_GOTO (ret, unmount);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"path", "%s", path);
+ XML_RET_CHECK_AND_GOTO (ret, unmount);
+
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"limit", "%s", value);
+ XML_RET_CHECK_AND_GOTO (ret, unmount);
+
+ if (size_str) {
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"size", "%s", size_str);
+ XML_RET_CHECK_AND_GOTO (ret, unmount);
+ GF_FREE (size_str);
+ size_str = NULL;
+ } else {
+ ret = xmlTextWriterWriteFormatElement
+ (writer, (xmlChar *)"size", "%"PRId64, size);
+ XML_RET_CHECK_AND_GOTO (ret, unmount);
+ }
+
+ /* </quota> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, unmount);
+
+ }
+
+unmount:
+ runinit (&runner);
+ runner_add_args (&runner, "umount",
+#if GF_LINUX_HOST_OS
+ "-l",
+#endif
+ mountdir, NULL);
+ ret = runner_run_reuse (&runner);
+ if (ret)
+ runner_log (&runner, "cli", GF_LOG_WARNING, "error executing");
+ runner_end (&runner);
+
+rm_dir:
+ rmdir (mountdir);
+
+cont:
+ /* </volQuota> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ GF_FREE (size_str);
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_peer_status (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ int count = 0;
+ char *uuid = NULL;
+ char *hostname = NULL;
+ int connected = 0;
+ int state_id = 0;
+ char *state_str = NULL;
+ int i = 1;
+ char key[1024] = {0,};
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <peerStatus> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"peerStatus");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (!dict)
+ goto cont;
+
+ ret = dict_get_int32 (dict, "count", &count);
+ if (ret)
+ goto out;
+
+ while (i <= count) {
+ /* <peer> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"peer");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "friend%d.uuid", i);
+ ret = dict_get_str (dict, key, &uuid);
+ if (ret)
+ goto out;
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"uuid",
+ "%s", uuid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "friend%d.hostname", i);
+ ret = dict_get_str (dict, key, &hostname);
+ if (ret)
+ goto out;
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"hostname",
+ "%s", hostname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "friend%d.connected", i);
+ ret = dict_get_int32 (dict, key, &connected);
+ if (ret)
+ goto out;
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"connected",
+ "%d", connected);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "friend%d.stateId", i);
+ ret = dict_get_int32 (dict, key, &state_id);
+ if (!ret) {
+ /* stateId is optional; write the element only when the key is present */
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"state", "%d", state_id);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "friend%d.state", i);
+ ret = dict_get_str (dict, key, &state_str);
+ if (!ret) {
+ /* the state string is optional; write the element only when the key is present */
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"stateStr", "%s", state_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* </peer> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ i++;
+ }
+
+cont:
+ /* </peerStatus> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+#if (HAVE_LIB_XML)
+/* Used for rebalance stop/status, remove-brick status */
+int
+cli_xml_output_vol_rebalance_status (xmlTextWriterPtr writer, dict_t *dict,
+ enum gf_task_types task_type)
+{
+ int ret = -1;
+ int count = 0;
+ char *node_name = NULL;
+ char *node_uuid = NULL;
+ uint64_t files = 0;
+ uint64_t size = 0;
+ uint64_t lookups = 0;
+ int status_rcd = 0;
+ uint64_t failures = 0;
+ uint64_t skipped = 0;
+ uint64_t total_files = 0;
+ uint64_t total_size = 0;
+ uint64_t total_lookups = 0;
+ uint64_t total_failures = 0;
+ uint64_t total_skipped = 0;
+ char key[1024] = {0,};
+ int i = 0;
+ int overall_status = -1;
+ double elapsed = 0;
+ double overall_elapsed = 0;
+
+ if (!dict) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "count", &count);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"nodeCount",
+ "%d", count);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ while (i < count) {
+ i++;
+
+ /* <node> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"node");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "node-name-%d", i);
+ ret = dict_get_str (dict, key, &node_name);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"nodeName",
+ "%s", node_name);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "node-uuid-%d", i);
+ ret = dict_get_str (dict, key, &node_uuid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"id",
+ "%s", node_uuid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "files-%d", i);
+ ret = dict_get_uint64 (dict, key, &files);
+ if (ret)
+ goto out;
+ total_files += files;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"files",
+ "%"PRIu64, files);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "size-%d", i);
+ ret = dict_get_uint64 (dict, key, &size);
+ if (ret)
+ goto out;
+ total_size += size;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"size",
+ "%"PRIu64,size);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "lookups-%d", i);
+ ret = dict_get_uint64 (dict, key, &lookups);
+ if (ret)
+ goto out;
+ total_lookups += lookups;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"lookups",
+ "%"PRIu64, lookups);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "failures-%d", i);
+ ret = dict_get_uint64 (dict, key, &failures);
+ if (ret)
+ goto out;
+ total_failures += failures;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"failures",
+ "%"PRIu64, failures);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* The dict does not carry a skipped-%d key for remove-brick, so the
+ failure count is reused as the skipped count in that case,
+ mirroring the logic of the non-XML CLI output */
+ if (task_type == GF_TASK_TYPE_REBALANCE) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "skipped-%d", i);
+ }
+ else {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "failures-%d", i);
+ }
+
+ ret = dict_get_uint64 (dict, key, &skipped);
+ if (ret)
+ goto out;
+ total_skipped += skipped;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"skipped",
+ "%"PRIu64, skipped);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "status-%d", i);
+ ret = dict_get_int32 (dict, key, &status_rcd);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"status",
+ "%d", status_rcd);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"statusStr",
+ "%s",
+ cli_vol_task_status_str[status_rcd]);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "run-time-%d", i);
+ ret = dict_get_double (dict, key, &elapsed);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"runtime",
+ "%.2f", elapsed);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (elapsed > overall_elapsed) {
+ overall_elapsed = elapsed;
+ }
+
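+ /* Fold this node's status into the overall status: a node that is
+ * not COMPLETE overrides an overall status that is COMPLETE or has
+ * a lower value. */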
+ if (-1 == overall_status)
+ overall_status = status_rcd;
+ else if ((GF_DEFRAG_STATUS_COMPLETE == overall_status ||
+ status_rcd > overall_status) &&
+ (status_rcd != GF_DEFRAG_STATUS_COMPLETE))
+ overall_status = status_rcd;
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </node> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ /* Aggregate status */
+ /* <aggregate> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"aggregate");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"files",
+ "%"PRIu64, total_files);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"size",
+ "%"PRIu64, total_size);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"lookups",
+ "%"PRIu64, total_lookups);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"failures",
+ "%"PRIu64, total_failures);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"skipped",
+ "%"PRIu64, total_skipped);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"status",
+ "%d", overall_status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"statusStr",
+ "%s",
+ cli_vol_task_status_str[overall_status]);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,(xmlChar *)"runtime",
+ "%.2f", overall_elapsed);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </aggregate> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_rebalance (gf_cli_defrag_type op, dict_t *dict, int op_ret,
+ int op_errno, char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ char *task_id_str = NULL;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volRebalance> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volRebalance");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY, &task_id_str);
+ if (ret == 0) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"task-id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"op",
+ "%d", op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if ((GF_DEFRAG_CMD_STOP == op) || (GF_DEFRAG_CMD_STATUS == op)) {
+ ret = cli_xml_output_vol_rebalance_status (writer, dict,
+ GF_TASK_TYPE_REBALANCE);
+ if (ret)
+ goto out;
+ }
+
+ /* </volRebalance> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_remove_brick (gf_boolean_t status_op, dict_t *dict,
+ int op_ret, int op_errno, char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ char *task_id_str = NULL;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volRemoveBrick> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volRemoveBrick");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, GF_REMOVE_BRICK_TID_KEY, &task_id_str);
+ if (ret == 0) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"task-id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ if (status_op) {
+ ret = cli_xml_output_vol_rebalance_status (writer, dict,
+ GF_TASK_TYPE_REMOVE_BRICK);
+ if (ret)
+ goto out;
+ }
+
+ /* </volRemoveBrick> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_replace_brick (gf1_cli_replace_op op, dict_t *dict,
+ int op_ret, int op_errno, char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ int status = 0;
+ uint64_t files = 0;
+ char *current_file = 0;
+ char *task_id_str = NULL;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <volReplaceBrick> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volReplaceBrick");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, GF_REPLACE_BRICK_TID_KEY, &task_id_str);
+ if (ret == 0) {
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"task-id",
+ "%s", task_id_str);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"op",
+ "%d", op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (GF_REPLACE_OP_STATUS == op) {
+ ret = dict_get_int32 (dict, "status", &status);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"status",
+ "%d", status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_uint64 (dict, "files", &files);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"files",
+ "%"PRIu64, files);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ if (status)
+ goto cont;
+
+ ret = dict_get_str (dict, "current_file", &current_file);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"currentFile",
+ "%s", current_file);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+cont:
+ /* </volReplaceBrick> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_vol_create (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ char *volname = NULL;
+ char *volid = NULL;
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ if (dict) {
+ /* <volCreate> */
+ ret = xmlTextWriterStartElement (writer,
+ (xmlChar *)"volCreate");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <volume> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volume");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *) "name",
+ "%s", volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "volume-id", &volid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"id",
+ "%s", volid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </volume> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </volCreate> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+int
+cli_xml_output_generic_volume (char *op, dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ char *volname = NULL;
+ char *volid = NULL;
+
+ GF_ASSERT (op);
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ if (dict) {
+ /* <"op"> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)op);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* <volume> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"volume");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *) "name",
+ "%s", volname);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_str (dict, "vol-id", &volid);
+ if (ret)
+ goto out;
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"id",
+ "%s", volid);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </volume> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </"op"> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+ }
+
+ ret = cli_end_xml_output (writer, doc);
+
+out:
+ gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+#if (HAVE_LIB_XML)
+int
+cli_xml_output_vol_gsync_status (dict_t *dict, xmlTextWriterPtr writer)
+{
+ char master_key[PATH_MAX] = "";
+ char slave_key[PATH_MAX] = "";
+ char status_key[PATH_MAX] = "";
+ char node_key[PATH_MAX] = "";
+ char *master = NULL;
+ char *slave = NULL;
+ char *status = NULL;
+ char *node = NULL;
+ int ret = -1;
+ int gsync_count = 0;
+ int i = 1;
+
+ ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
+ if (ret)
+ goto out;
+
+ for (i=1; i <= gsync_count; i++) {
+ snprintf (node_key, sizeof(node_key), "node%d", i);
+ snprintf (master_key, sizeof(master_key), "master%d", i);
+ snprintf (slave_key, sizeof(slave_key), "slave%d", i);
+ snprintf (status_key, sizeof(status_key), "status%d", i);
+
+ ret = dict_get_str (dict, node_key, &node);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, master_key, &master);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, slave_key, &slave);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, status_key, &status);
+ if (ret)
+ goto out;
+
+ /* <pair> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"pair");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"node",
+ "%s", node);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"master",
+ "%s", master);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"slave",
+ "%s", slave);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"status",
+ "%s", status);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ /* </pair> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ }
+
+out:
+ gf_log ("cli",GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+#endif
+
+int
+cli_xml_output_vol_gsync (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr)
+{
+#if (HAVE_LIB_XML)
+ int ret = -1;
+ xmlTextWriterPtr writer = NULL;
+ xmlDocPtr doc = NULL;
+ char *master = NULL;
+ char *slave = NULL;
+ int type = 0;
+
+ GF_ASSERT (dict);
+
+ ret = cli_begin_xml_output (&writer, &doc);
+ if (ret)
+ goto out;
+
+ ret = cli_xml_output_common (writer, op_ret, op_errno, op_errstr);
+ if (ret)
+ goto out;
+
+ /* <geoRep> */
+ ret = xmlTextWriterStartElement (writer, (xmlChar *)"geoRep");
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "Failed to get type");
+ goto out;
+ }
+
+ ret = xmlTextWriterWriteFormatElement (writer, (xmlChar *)"type",
+ "%d", type);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ switch (type) {
+ case GF_GSYNC_OPTION_TYPE_START:
+ case GF_GSYNC_OPTION_TYPE_STOP:
+ if (dict_get_str (dict, "master", &master) != 0)
+ master = "???";
+ if (dict_get_str (dict, "slave", &slave) != 0)
+ slave = "???";
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"master",
+ "%s", master);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = xmlTextWriterWriteFormatElement (writer,
+ (xmlChar *)"slave",
+ "%s", slave);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_CONFIG:
+ break;
+ case GF_GSYNC_OPTION_TYPE_STATUS:
+ ret = cli_xml_output_vol_gsync_status(dict, writer);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ /* </geoRep> */
+ ret = xmlTextWriterEndElement (writer);
+ XML_RET_CHECK_AND_GOTO (ret, out);
+
+ ret = cli_end_xml_output (writer, doc);
+out:
+ gf_log ("cli",GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+#else
+ return 0;
+#endif
+}
diff --git a/cli/src/cli.c b/cli/src/cli.c
index 932869038..91b315ff1 100644
--- a/cli/src/cli.c
+++ b/cli/src/cli.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -99,43 +89,6 @@ rpc_clnt_prog_t *cli_rpc_prog;
extern struct rpc_clnt_program cli_prog;
-
-
-
-static char *
-generate_uuid ()
-{
- char tmp_str[1024] = {0,};
- char hostname[256] = {0,};
- struct timeval tv = {0,};
- struct tm now = {0, };
- char now_str[32];
-
- if (gettimeofday (&tv, NULL) == -1) {
- gf_log ("glusterfsd", GF_LOG_ERROR,
- "gettimeofday: failed %s",
- strerror (errno));
- }
-
- if (gethostname (hostname, 256) == -1) {
- gf_log ("glusterfsd", GF_LOG_ERROR,
- "gethostname: failed %s",
- strerror (errno));
- }
-
- localtime_r (&tv.tv_sec, &now);
- strftime (now_str, 32, "%Y/%m/%d-%H:%M:%S", &now);
- snprintf (tmp_str, 1024, "%s-%d-%s:%"
-#ifdef GF_DARWIN_HOST_OS
- PRId32,
-#else
- "ld",
-#endif
- hostname, getpid(), now_str, tv.tv_usec);
-
- return gf_strdup (tmp_str);
-}
-
static int
glusterfs_ctx_defaults_init (glusterfs_ctx_t *ctx)
{
@@ -145,13 +98,13 @@ glusterfs_ctx_defaults_init (glusterfs_ctx_t *ctx)
xlator_mem_acct_init (THIS, cli_mt_end);
- ctx->process_uuid = generate_uuid ();
+ ctx->process_uuid = generate_glusterfs_ctx_id ();
if (!ctx->process_uuid)
return -1;
ctx->page_size = 128 * GF_UNIT_KB;
- ctx->iobuf_pool = iobuf_pool_new (8 * GF_UNIT_MB, ctx->page_size);
+ ctx->iobuf_pool = iobuf_pool_new ();
if (!ctx->iobuf_pool)
return -1;
@@ -164,22 +117,33 @@ glusterfs_ctx_defaults_init (glusterfs_ctx_t *ctx)
if (!pool)
return -1;
- /* frame_mem_pool size 112 * 16k */
- pool->frame_mem_pool = mem_pool_new (call_frame_t, 16384);
-
+ /* frame_mem_pool size 112 * 64 */
+ pool->frame_mem_pool = mem_pool_new (call_frame_t, 32);
if (!pool->frame_mem_pool)
return -1;
- /* stack_mem_pool size 256 * 8k */
- pool->stack_mem_pool = mem_pool_new (call_stack_t, 8192);
+ /* stack_mem_pool size 256 * 128 */
+ pool->stack_mem_pool = mem_pool_new (call_stack_t, 16);
if (!pool->stack_mem_pool)
return -1;
- ctx->stub_mem_pool = mem_pool_new (call_stub_t, 1024);
+ ctx->stub_mem_pool = mem_pool_new (call_stub_t, 16);
if (!ctx->stub_mem_pool)
return -1;
+ ctx->dict_pool = mem_pool_new (dict_t, 32);
+ if (!ctx->dict_pool)
+ return -1;
+
+ ctx->dict_pair_pool = mem_pool_new (data_pair_t, 512);
+ if (!ctx->dict_pair_pool)
+ return -1;
+
+ ctx->dict_data_pool = mem_pool_new (data_t, 512);
+ if (!ctx->dict_data_pool)
+ return -1;
+
INIT_LIST_HEAD (&pool->all_frames);
LOCK_INIT (&pool->lock);
ctx->pool = pool;
@@ -199,12 +163,13 @@ glusterfs_ctx_defaults_init (glusterfs_ctx_t *ctx)
static int
-logging_init (struct cli_state *state)
+logging_init (glusterfs_ctx_t *ctx, struct cli_state *state)
{
char *log_file = state->log_file ? state->log_file :
DEFAULT_CLI_LOG_FILE_DIRECTORY "/cli.log";
- if (gf_log_init (log_file) == -1) {
+ /* passing ident as NULL means to use default ident for syslog */
+ if (gf_log_init (ctx, log_file, NULL) == -1) {
fprintf (stderr, "ERROR: failed to open logfile %s\n",
log_file);
return -1;
@@ -273,6 +238,8 @@ cli_submit_request (void *req, call_frame_t *frame,
out:
if (new_iobref)
iobref_unref (iobref);
+ if (iobuf)
+ iobuf_unref (iobuf);
return ret;
}
@@ -317,16 +284,44 @@ cli_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
return ret;
}
+
+/*
+ * ret: 0: option successfully processed
+ * 1: signalling end of option list
+ * -1: unknown option or other issue
+ */
int
cli_opt_parse (char *opt, struct cli_state *state)
{
char *oarg;
+ if (strcmp (opt, "") == 0)
+ return 1;
+
if (strcmp (opt, "version") == 0) {
- puts (argp_program_version);
+ cli_out ("%s", argp_program_version);
exit (0);
}
+ if (strcmp (opt, "print-logdir") == 0) {
+ cli_out ("%s", DEFAULT_LOG_FILE_DIRECTORY);
+ exit (0);
+ }
+
+ if (strcmp (opt, "print-statedumpdir") == 0) {
+ cli_out ("%s", DEFAULT_VAR_RUN_DIRECTORY);
+ exit (0);
+ }
+
+ if (strcmp (opt, "xml") == 0) {
+#if (HAVE_LIB_XML)
+ state->mode |= GLUSTER_MODE_XML;
+#else
+ cli_err ("XML output not supported. Ignoring '--xml' option");
+#endif
+ return 0;
+ }
+
oarg = strtail (opt, "mode=");
if (oarg) {
if (strcmp (oarg, "script") == 0) {
@@ -358,6 +353,12 @@ cli_opt_parse (char *opt, struct cli_state *state)
return 0;
}
+ oarg = strtail (opt, "glusterd-sock=");
+ if (oarg) {
+ state->glusterd_sock = oarg;
+ return 0;
+ }
+
return -1;
}
@@ -385,9 +386,16 @@ parse_cmdline (int argc, char *argv[], struct cli_state *state)
state->argc--;
/* argv shifted, next check should be at i again */
i--;
+ if (ret == 1) {
+ /* end of cli options */
+ ret = 0;
+ break;
+ }
}
}
+ state->argv[state->argc] = NULL;
+
return ret;
}
@@ -412,7 +420,6 @@ cli_state_init (struct cli_state *state)
int ret = 0;
- state->remote_host = "localhost";
state->log_level = -1;
tree = &state->tree;
@@ -432,11 +439,35 @@ cli_usage_out (const char *usage)
if (!usage || usage[0] == '\0')
return -1;
- cli_out ("Usage: %s", usage);
+ cli_err ("Usage: %s", usage);
return 0;
}
int
+_cli_err (const char *fmt, ...)
+{
+ struct cli_state *state = NULL;
+ va_list ap;
+ int ret = 0;
+
+ state = global_state;
+
+ va_start (ap, fmt);
+
+#ifdef HAVE_READLINE
+ if (state->rl_enabled && !state->rl_processing)
+ return cli_rl_err(state, fmt, ap);
+#endif
+
+ ret = vfprintf (stderr, fmt, ap);
+ fprintf (stderr, "\n");
+ va_end (ap);
+
+ return ret;
+}
+
+
+int
_cli_out (const char *fmt, ...)
{
struct cli_state *state = NULL;
@@ -475,23 +506,46 @@ cli_rpc_init (struct cli_state *state)
if (!options)
goto out;
- ret = dict_set_str (options, "remote-host", state->remote_host);
- if (ret)
- goto out;
-
- if (state->remote_port)
- port = state->remote_port;
+ /* Connect to glusterd using the specified method, giving
+ * preference to a unix socket connection. If nothing is specified,
+ * connect to the default glusterd socket.
+ */
+ if (state->glusterd_sock) {
+ gf_log ("cli", GF_LOG_INFO, "Connecting to glusterd using "
+ "sockfile %s", state->glusterd_sock);
+ ret = rpc_transport_unix_options_build (&options,
+ state->glusterd_sock,
+ 0);
+ if (ret)
+ goto out;
+ } else if (state->remote_host) {
+ gf_log ("cli", GF_LOG_INFO, "Connecting to remote glusterd at "
+ "%s", state->remote_host);
+ ret = dict_set_str (options, "remote-host", state->remote_host);
+ if (ret)
+ goto out;
- ret = dict_set_int32 (options, "remote-port", port);
- if (ret)
- goto out;
+ if (state->remote_port)
+ port = state->remote_port;
- ret = dict_set_str (options, "transport.address-family", "inet/inet6");
- if (ret)
- goto out;
+ ret = dict_set_int32 (options, "remote-port", port);
+ if (ret)
+ goto out;
- rpc = rpc_clnt_new (options, this->ctx, this->name);
+ ret = dict_set_str (options, "transport.address-family",
+ "inet");
+ if (ret)
+ goto out;
+ } else {
+ gf_log ("cli", GF_LOG_DEBUG, "Connecting to glusterd using "
+ "default socket");
+ ret = rpc_transport_unix_options_build
+ (&options, DEFAULT_GLUSTERD_SOCKFILE, 0);
+ if (ret)
+ goto out;
+ }
+ rpc = rpc_clnt_new (options, this->ctx, this->name, 16);
if (!rpc)
goto out;
@@ -501,7 +555,7 @@ cli_rpc_init (struct cli_state *state)
goto out;
}
- rpc_clnt_start (rpc);
+ ret = rpc_clnt_start (rpc);
out:
if (ret) {
if (rpc)
@@ -525,76 +579,15 @@ void
cli_local_wipe (cli_local_t *local)
{
if (local) {
+ GF_FREE (local->get_vol.volname);
+ if (local->dict)
+ dict_unref (local->dict);
GF_FREE (local);
}
return;
}
-/* If the path exists use realpath(3) to handle extra slashes and to resolve
- * symlinks else strip the extra slashes in the path and return */
-
-int
-cli_canonicalize_path (char *path)
-{
- struct stat sb = {0};
- int ret = -1;
- char *tmppath = NULL;
- char *dir = NULL;
- char *tmpstr = NULL;
- int path_len = 0;
-
- if (!path)
- return ret;
-
- ret = stat (path, &sb);
- if (ret == -1) {
- /* Strip the extra slashes and return */
- tmppath = gf_strdup (path);
- if (tmppath == NULL) {
- ret = -1;
- gf_log ("cli", GF_LOG_ERROR, "Out of memory.");
- goto out;
- }
- bzero (path, strlen(path));
- path[0] = '/';
- dir = strtok_r(tmppath, "/", &tmpstr);
- while (dir) {
- strncpy ((path + path_len + 1), dir, strlen(dir));
- path_len = strlen (path);
- dir = strtok_r(NULL, "/", &tmpstr);
- if (dir)
- strncpy((path + path_len), "/", 1);
- }
- if (path_len == 0)
- path[1] = '\0';
- else
- path[path_len] = '\0';
- ret = 0;
- goto out;
- } else {
- tmppath = gf_strdup(path);
- if (tmppath == NULL) {
- ret = -1;
- gf_log ("cli", GF_LOG_ERROR, "Out of memory.");
- goto out;
- }
- if (realpath (tmppath, path) == NULL) {
- cli_out ("Path manipulation failed: %s",
- strerror(errno));
- gf_log ("cli", GF_LOG_ERROR, "Path manipulation "
- "failed: %s", strerror(errno));
- ret = -1;
- goto out;
- }
- ret = 0;
- }
-out:
- if (tmppath)
- GF_FREE(tmppath);
- return ret;
-}
-
struct cli_state *global_state;
int
@@ -604,13 +597,19 @@ main (int argc, char *argv[])
int ret = -1;
glusterfs_ctx_t *ctx = NULL;
- ret = glusterfs_globals_init ();
+ ctx = glusterfs_ctx_new ();
+ if (!ctx)
+ return ENOMEM;
+
+#ifdef DEBUG
+ gf_mem_acct_enable_set (ctx);
+#endif
+
+ ret = glusterfs_globals_init (ctx);
if (ret)
return ret;
- ctx = glusterfs_ctx_get ();
- if (!ctx)
- return ENOMEM;
+ THIS->ctx = ctx;
ret = glusterfs_ctx_defaults_init (ctx);
if (ret)
@@ -627,7 +626,7 @@ main (int argc, char *argv[])
if (ret)
goto out;
- ret = logging_init (&state);
+ ret = logging_init (ctx, &state);
if (ret)
goto out;
diff --git a/cli/src/cli.h b/cli/src/cli.h
index 0a2fdb54b..8daa4b741 100644
--- a/cli/src/cli.h
+++ b/cli/src/cli.h
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __CLI_H__
#define __CLI_H__
@@ -28,16 +18,24 @@
#include "rpc-clnt.h"
#include "glusterfs.h"
#include "protocol-common.h"
+#include "logging.h"
+
+#include "cli1-xdr.h"
+
+#if (HAVE_LIB_XML)
+#include <libxml/encoding.h>
+#include <libxml/xmlwriter.h>
+#endif
#define DEFAULT_EVENT_POOL_SIZE 16384
#define CLI_GLUSTERD_PORT 24007
#define CLI_DEFAULT_CONN_TIMEOUT 120
#define CLI_DEFAULT_CMD_TIMEOUT 120
-#define CLI_TOP_CMD_TIMEOUT 300 //Longer timeout for volume top
+#define CLI_TEN_MINUTES_TIMEOUT 600 //Longer timeout for volume top
#define DEFAULT_CLI_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
#define CLI_VOL_STATUS_BRICK_LEN 55
#define CLI_TAB_LENGTH 8
-#define CLI_BRICK_STATUS_LINE_LEN 75
+#define CLI_BRICK_STATUS_LINE_LEN 78
enum argp_option_keys {
ARGP_DEBUG_KEY = 133,
@@ -46,12 +44,16 @@ enum argp_option_keys {
#define GLUSTER_MODE_SCRIPT (1 << 0)
#define GLUSTER_MODE_ERR_FATAL (1 << 1)
-
+#define GLUSTER_MODE_XML (1 << 2)
struct cli_state;
struct cli_cmd_word;
struct cli_cmd_tree;
struct cli_cmd;
+extern char *cli_vol_type_str[];
+extern char *cli_vol_status_str[];
+extern char *cli_vol_task_status_str[];
+
typedef int (cli_cmd_cbk_t)(struct cli_state *state,
struct cli_cmd_word *word,
const char **words,
@@ -113,49 +115,66 @@ struct cli_state {
char *log_file;
gf_loglevel_t log_level;
+
+ char *glusterd_sock;
};
struct cli_local {
- union {
- struct {
- dict_t *dict;
- } create_vol;
-
- struct {
- char *volname;
- int flags;
- } start_vol;
-
- struct {
- char *volname;
- int flags;
- } stop_vol;
-
- struct {
- char *volname;
- } delete_vol;
-
- struct {
- char *volname;
- int cmd;
- } defrag_vol;
-
- struct {
- char *volname;
- dict_t *dict;
- } replace_brick;
-
- struct {
- char *volname;
- int flags;
- } get_vol;
-
- struct {
- char *volname;
- }heal_vol;
- } u;
+ struct {
+ char *volname;
+ int flags;
+ } get_vol;
+
+ dict_t *dict;
+ const char **words;
+ /* Marker for volume status all */
+ gf_boolean_t all;
+#if (HAVE_LIB_XML)
+ xmlTextWriterPtr writer;
+ xmlDocPtr doc;
+ int vol_count;
+#endif
};
+struct gf_cli_gsync_detailed_status_ {
+ char *node;
+ char *master;
+ char *slave;
+ char *health;
+ char *uptime;
+ char *files_syncd;
+ char *files_pending;
+ char *bytes_pending;
+ char *deletes_pending;
+};
+
+struct cli_volume_status {
+ int port;
+ int online;
+ uint64_t block_size;
+ uint64_t total_inodes;
+ uint64_t free_inodes;
+ char *brick;
+ char *pid_str;
+ char *free;
+ char *total;
+#ifdef GF_LINUX_HOST_OS
+ char *fs_name;
+ char *mount_options;
+ char *device;
+ char *inode_size;
+#endif
+};
+
+struct snap_config_opt_vals_ {
+ char *op_name;
+ char *question;
+};
+
+typedef struct gf_cli_gsync_detailed_status_ gf_cli_gsync_status_t;
+
+typedef struct cli_volume_status cli_volume_status_t;
+
typedef struct cli_local cli_local_t;
typedef ssize_t (*cli_serialize_t) (struct iovec outmsg, void *args);
@@ -176,10 +195,12 @@ int cli_cmd_process_line (struct cli_state *state, const char *line);
int cli_rl_enable (struct cli_state *state);
int cli_rl_out (struct cli_state *state, const char *fmt, va_list ap);
+int cli_rl_err (struct cli_state *state, const char *fmt, va_list ap);
int cli_usage_out (const char *usage);
int _cli_out (const char *fmt, ...);
+int _cli_err (const char *fmt, ...);
#define cli_out(fmt...) do { \
FMT_WARN (fmt); \
@@ -188,6 +209,13 @@ int _cli_out (const char *fmt, ...);
\
} while (0)
+#define cli_err(fmt...) do { \
+ FMT_WARN (fmt); \
+ \
+ _cli_err(fmt); \
+ \
+ } while (0)
+
int
cli_submit_request (void *req, call_frame_t *frame,
rpc_clnt_prog_t *prog,
@@ -209,7 +237,7 @@ cli_cmd_quota_parse (const char **words, int wordcount, dict_t **opt);
int32_t
cli_cmd_volume_set_parse (const char **words, int wordcount,
- dict_t **options);
+ dict_t **options, char **op_errstr);
int32_t
cli_cmd_volume_add_brick_parse (const char **words, int wordcount,
@@ -233,6 +261,9 @@ cli_cmd_log_filename_parse (const char **words, int wordcount, dict_t **options)
int32_t
cli_cmd_volume_statedump_options_parse (const char **words, int wordcount,
dict_t **options);
+int32_t
+cli_cmd_volume_clrlks_opts_parse (const char **words, int wordcount,
+ dict_t **options);
cli_local_t * cli_local_get ();
@@ -249,9 +280,6 @@ int
cli_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
void *data);
-int
-cli_canonicalize_path (char *path);
-
int32_t
cli_cmd_volume_profile_parse (const char **words, int wordcount,
dict_t **options);
@@ -268,8 +296,105 @@ cli_cmd_volume_status_parse (const char **words, int wordcount,
dict_t **options);
int
-cli_print_brick_status (char *brick, int port, int online, int pid);
+cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
+ dict_t **options);
+
+int
+cli_cmd_volume_defrag_parse (const char **words, int wordcount,
+ dict_t **options);
+
+int
+cli_print_brick_status (cli_volume_status_t *status);
+
+void
+cli_print_detailed_status (cli_volume_status_t *status);
+
+int
+cli_get_detail_status (dict_t *dict, int i, cli_volume_status_t *status);
void
cli_print_line (int len);
+
+int
+cli_xml_output_str (char *op, char *str, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_dict (char *op, dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_top (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_profile (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_status_begin (cli_local_t *local, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_status_end (cli_local_t *local);
+
+int
+cli_xml_output_vol_status (cli_local_t *local, dict_t *dict);
+
+int
+cli_xml_output_vol_list (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_info_begin (cli_local_t *local, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_info_end (cli_local_t *local);
+
+int
+cli_xml_output_vol_info (cli_local_t *local, dict_t *dict);
+
+int
+cli_xml_output_vol_quota_limit_list (char *volname, char *limit_list,
+ int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_peer_status (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_rebalance (gf_cli_defrag_type op, dict_t *dict, int op_ret,
+ int op_errno, char *op_errstr);
+
+int
+cli_xml_output_vol_remove_brick (gf_boolean_t status_op, dict_t *dict,
+ int op_ret, int op_errno, char *op_errstr);
+
+int
+cli_xml_output_vol_replace_brick (gf1_cli_replace_op op, dict_t *dict,
+ int op_ret, int op_errno, char *op_errstr);
+
+int
+cli_xml_output_vol_create (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_generic_volume (char *op, dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+
+int
+cli_xml_output_vol_gsync (dict_t *dict, int op_ret, int op_errno,
+ char *op_errstr);
+int
+cli_xml_output_vol_status_tasks_detail (cli_local_t *local, dict_t *dict);
+
+char *
+is_server_debug_xlator (void *myframe);
+
+int32_t
+cli_cmd_snapshot_parse (const char **words, int wordcount, dict_t **options,
+ struct cli_state *state);
+
#endif /* __CLI_H__ */
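
The header changes above replace the old per-field cli_print_brick_status() with a variant that takes the new cli_volume_status_t aggregate, and add a cli_err() macro that mirrors cli_out() but routes through _cli_err(). A minimal sketch of a hypothetical caller, assuming the updated cli.h is on the include path and also pulls in dict_t; none of this helper code is part of the patch:

    #include "cli.h"

    /* Illustrative only: print one brick's status from a status-reply
     * dict at index i, using the declarations introduced above. */
    static int
    show_brick_status (dict_t *dict, int i)
    {
            cli_volume_status_t status = {0,};
            int                 ret    = -1;

            /* cli_get_detail_status() fills the struct from the reply dict */
            ret = cli_get_detail_status (dict, i, &status);
            if (ret) {
                    cli_err ("Failed to read status of brick %d", i);
                    return ret;
            }

            cli_print_brick_status (&status);    /* one-line summary */
            cli_print_detailed_status (&status); /* block/inode details */

            return 0;
    }
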
diff --git a/cli/src/input.c b/cli/src/input.c
index a88d35874..a8ea46c6d 100644
--- a/cli/src/input.c
+++ b/cli/src/input.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -44,7 +34,7 @@ cli_batch (void *d)
ret = cli_cmd_process (state, state->argc, state->argv);
gf_log ("", GF_LOG_INFO, "Exiting with: %d", ret);
- exit (ret);
+ exit (-ret);
return NULL;
}
@@ -71,11 +61,11 @@ cli_input (void *d)
if (len > 0 && cmd[len - 1] == '\n') //strip trailing \n
cmd[len - 1] = '\0';
ret = cli_cmd_process_line (state, cmd);
- if (ret == -1 && state->mode & GLUSTER_MODE_ERR_FATAL)
+ if (ret != 0 && state->mode & GLUSTER_MODE_ERR_FATAL)
break;
}
- exit (ret);
+ exit (-ret);
return NULL;
}
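
Both exit paths above now negate the return value. CLI internals follow the usual GlusterFS convention of returning 0 on success and a negative value on failure, but a process exit status must be a small non-negative number, so exit(-ret) maps ret = -1 to exit status 1 rather than the 255 that exit(-1) would yield. A toy illustration of that mapping:

    #include <stdlib.h>

    int
    main (void)
    {
            int ret = -1;   /* typical failure value from a cli_cmd_* routine */

            exit (-ret);    /* the shell sees $? == 1, not 255 */
    }
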
diff --git a/cli/src/registry.c b/cli/src/registry.c
index 5b63e82b1..c4abe3be0 100644
--- a/cli/src/registry.c
+++ b/cli/src/registry.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#ifndef _CONFIG_H
#define _CONFIG_H
@@ -387,7 +377,7 @@ cli_cmd_register (struct cli_cmd_tree *tree, struct cli_cmd *cmd)
char **tokens = NULL;
int ret = 0;
- GF_ASSERT (cmd)
+ GF_ASSERT (cmd);
if (cmd->reg_cbk)
cmd->reg_cbk (cmd);
diff --git a/configure.ac b/configure.ac
index 2a34a61af..b3d1ed184 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,21 +1,12 @@
-dnl Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com>
-
-dnl This file is part of GlusterFS.
-dnl
-dnl GlusterFS is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU General Public License as published by
-dnl the Free Software Foundation; either version 3 of the License, or
-dnl (at your option) any later version.
+dnl Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+dnl This file is part of GlusterFS.
dnl
-dnl GlusterFS is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-dnl GNU General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU General Public License
-dnl along with this program. If not, see <http://www.gnu.org/licenses/>.
+dnl This file is licensed to you under your choice of the GNU Lesser
+dnl General Public License, version 3 or any later version (LGPLv3 or
+dnl later), or the GNU General Public License, version 2 (GPLv2), in all
+dnl cases as published by the Free Software Foundation.
-AC_INIT([glusterfs],[3git],[gluster-users@gluster.org])
+AC_INIT([glusterfs],[3git],[gluster-users@gluster.org],,[https://github.com/gluster/glusterfs.git])
AM_INIT_AUTOMAKE
@@ -33,13 +24,15 @@ if libtool --help 2>&1 | grep -q quiet; then
AM_LIBTOOLFLAGS="--quiet";
fi
-AM_CONFIG_HEADER([config.h])
+AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([Makefile
- libglusterfs/Makefile
- libglusterfs/src/Makefile
- glusterfsd/Makefile
- glusterfsd/src/Makefile
+ libglusterfs/Makefile
+ libglusterfs/src/Makefile
+ geo-replication/src/peer_gsec_create
+ geo-replication/src/peer_add_secret_pub
+ glusterfsd/Makefile
+ glusterfsd/src/Makefile
rpc/Makefile
rpc/rpc-lib/Makefile
rpc/rpc-lib/src/Makefile
@@ -50,111 +43,162 @@ AC_CONFIG_FILES([Makefile
rpc/rpc-transport/rdma/src/Makefile
rpc/xdr/Makefile
rpc/xdr/src/Makefile
- xlators/Makefile
- xlators/mount/Makefile
- xlators/mount/fuse/Makefile
- xlators/mount/fuse/src/Makefile
- xlators/mount/fuse/utils/mount.glusterfs
- xlators/mount/fuse/utils/mount_glusterfs
- xlators/mount/fuse/utils/Makefile
- xlators/storage/Makefile
- xlators/storage/posix/Makefile
- xlators/storage/posix/src/Makefile
- xlators/cluster/Makefile
- xlators/cluster/afr/Makefile
- xlators/cluster/afr/src/Makefile
- xlators/cluster/stripe/Makefile
- xlators/cluster/stripe/src/Makefile
- xlators/cluster/dht/Makefile
- xlators/cluster/dht/src/Makefile
- xlators/performance/Makefile
- xlators/performance/write-behind/Makefile
- xlators/performance/write-behind/src/Makefile
- xlators/performance/read-ahead/Makefile
- xlators/performance/read-ahead/src/Makefile
- xlators/performance/io-threads/Makefile
- xlators/performance/io-threads/src/Makefile
- xlators/performance/io-cache/Makefile
- xlators/performance/io-cache/src/Makefile
- xlators/performance/symlink-cache/Makefile
- xlators/performance/symlink-cache/src/Makefile
- xlators/performance/quick-read/Makefile
- xlators/performance/quick-read/src/Makefile
- xlators/performance/stat-prefetch/Makefile
- xlators/performance/stat-prefetch/src/Makefile
- xlators/debug/Makefile
- xlators/debug/trace/Makefile
- xlators/debug/trace/src/Makefile
- xlators/debug/error-gen/Makefile
- xlators/debug/error-gen/src/Makefile
- xlators/debug/io-stats/Makefile
- xlators/debug/io-stats/src/Makefile
- xlators/protocol/Makefile
- xlators/protocol/auth/Makefile
- xlators/protocol/auth/addr/Makefile
- xlators/protocol/auth/addr/src/Makefile
- xlators/protocol/auth/login/Makefile
- xlators/protocol/auth/login/src/Makefile
- xlators/protocol/client/Makefile
- xlators/protocol/client/src/Makefile
- xlators/protocol/server/Makefile
- xlators/protocol/server/src/Makefile
- xlators/features/Makefile
- xlators/features/locks/Makefile
- xlators/features/locks/src/Makefile
- xlators/features/trash/Makefile
- xlators/features/trash/src/Makefile
- xlators/features/quota/Makefile
- xlators/features/quota/src/Makefile
+ xlators/Makefile
+ xlators/mount/Makefile
+ xlators/mount/fuse/Makefile
+ xlators/mount/fuse/src/Makefile
+ xlators/mount/fuse/utils/mount.glusterfs
+ xlators/mount/fuse/utils/mount_glusterfs
+ xlators/mount/fuse/utils/Makefile
+ xlators/storage/Makefile
+ xlators/storage/posix/Makefile
+ xlators/storage/posix/src/Makefile
+ xlators/storage/bd/Makefile
+ xlators/storage/bd/src/Makefile
+ xlators/cluster/Makefile
+ xlators/cluster/afr/Makefile
+ xlators/cluster/afr/src/Makefile
+ xlators/cluster/stripe/Makefile
+ xlators/cluster/stripe/src/Makefile
+ xlators/cluster/dht/Makefile
+ xlators/cluster/dht/src/Makefile
+ xlators/performance/Makefile
+ xlators/performance/write-behind/Makefile
+ xlators/performance/write-behind/src/Makefile
+ xlators/performance/read-ahead/Makefile
+ xlators/performance/read-ahead/src/Makefile
+ xlators/performance/readdir-ahead/Makefile
+ xlators/performance/readdir-ahead/src/Makefile
+ xlators/performance/io-threads/Makefile
+ xlators/performance/io-threads/src/Makefile
+ xlators/performance/io-cache/Makefile
+ xlators/performance/io-cache/src/Makefile
+ xlators/performance/symlink-cache/Makefile
+ xlators/performance/symlink-cache/src/Makefile
+ xlators/performance/quick-read/Makefile
+ xlators/performance/quick-read/src/Makefile
+ xlators/performance/open-behind/Makefile
+ xlators/performance/open-behind/src/Makefile
+ xlators/performance/md-cache/Makefile
+ xlators/performance/md-cache/src/Makefile
+ xlators/debug/Makefile
+ xlators/debug/trace/Makefile
+ xlators/debug/trace/src/Makefile
+ xlators/debug/error-gen/Makefile
+ xlators/debug/error-gen/src/Makefile
+ xlators/debug/io-stats/Makefile
+ xlators/debug/io-stats/src/Makefile
+ xlators/protocol/Makefile
+ xlators/protocol/auth/Makefile
+ xlators/protocol/auth/addr/Makefile
+ xlators/protocol/auth/addr/src/Makefile
+ xlators/protocol/auth/login/Makefile
+ xlators/protocol/auth/login/src/Makefile
+ xlators/protocol/client/Makefile
+ xlators/protocol/client/src/Makefile
+ xlators/protocol/server/Makefile
+ xlators/protocol/server/src/Makefile
+ xlators/features/Makefile
+ xlators/features/changelog/Makefile
+ xlators/features/changelog/src/Makefile
+ xlators/features/changelog/lib/Makefile
+ xlators/features/changelog/lib/src/Makefile
+ xlators/features/glupy/Makefile
+ xlators/features/glupy/src/Makefile
+ xlators/features/locks/Makefile
+ xlators/features/locks/src/Makefile
+ xlators/features/quota/Makefile
+ xlators/features/quota/src/Makefile
xlators/features/marker/Makefile
xlators/features/marker/src/Makefile
- xlators/features/marker/utils/Makefile
- xlators/features/marker/utils/src/Makefile
- xlators/features/marker/utils/syncdaemon/Makefile
- xlators/features/read-only/Makefile
- xlators/features/read-only/src/Makefile
- xlators/features/mac-compat/Makefile
- xlators/features/mac-compat/src/Makefile
- xlators/features/quiesce/Makefile
- xlators/features/quiesce/src/Makefile
- xlators/encryption/Makefile
- xlators/encryption/rot-13/Makefile
- xlators/encryption/rot-13/src/Makefile
+ xlators/features/read-only/Makefile
+ xlators/features/read-only/src/Makefile
+ xlators/features/compress/Makefile
+ xlators/features/compress/src/Makefile
+ xlators/features/mac-compat/Makefile
+ xlators/features/mac-compat/src/Makefile
+ xlators/features/quiesce/Makefile
+ xlators/features/quiesce/src/Makefile
+ xlators/features/index/Makefile
+ xlators/features/index/src/Makefile
+ xlators/features/protect/Makefile
+ xlators/features/protect/src/Makefile
+ xlators/features/gfid-access/Makefile
+ xlators/features/gfid-access/src/Makefile
+ xlators/playground/Makefile
+ xlators/playground/template/Makefile
+ xlators/playground/template/src/Makefile
+ xlators/encryption/Makefile
+ xlators/encryption/rot-13/Makefile
+ xlators/encryption/rot-13/src/Makefile
+ xlators/encryption/crypt/Makefile
+ xlators/encryption/crypt/src/Makefile
+ xlators/features/qemu-block/Makefile
+ xlators/features/qemu-block/src/Makefile
xlators/system/Makefile
xlators/system/posix-acl/Makefile
xlators/system/posix-acl/src/Makefile
- cli/Makefile
- cli/src/Makefile
- doc/Makefile
- doc/examples/Makefile
- doc/hacker-guide/Makefile
- extras/Makefile
- extras/init.d/Makefile
- extras/init.d/glusterd.plist
- extras/init.d/glusterd-Debian
- extras/init.d/glusterd-Redhat
- extras/init.d/glusterd-SuSE
- extras/benchmarking/Makefile
- contrib/fuse-util/Makefile
xlators/nfs/Makefile
xlators/nfs/server/Makefile
xlators/nfs/server/src/Makefile
xlators/mgmt/Makefile
xlators/mgmt/glusterd/Makefile
xlators/mgmt/glusterd/src/Makefile
- glusterfs.spec])
+ cli/Makefile
+ cli/src/Makefile
+ doc/Makefile
+ extras/Makefile
+ extras/init.d/Makefile
+ extras/init.d/glusterd.plist
+ extras/init.d/glusterd-Debian
+ extras/init.d/glusterd-Redhat
+ extras/init.d/glusterd-SuSE
+ extras/systemd/Makefile
+ extras/systemd/glusterd.service
+ extras/benchmarking/Makefile
+ extras/hook-scripts/Makefile
+ extras/ocf/Makefile
+ extras/ocf/glusterd
+ extras/ocf/volume
+ extras/LinuxRPM/Makefile
+ extras/geo-rep/Makefile
+ contrib/fuse-util/Makefile
+ contrib/uuid/uuid_types.h
+ glusterfs-api.pc
+ libgfchangelog.pc
+ api/Makefile
+ api/src/Makefile
+ api/examples/Makefile
+ api/examples/setup.py
+ geo-replication/Makefile
+ geo-replication/src/Makefile
+ geo-replication/syncdaemon/Makefile
+ glusterfs.spec])
AC_CANONICAL_HOST
AC_PROG_CC
+AC_DISABLE_STATIC
AC_PROG_LIBTOOL
+AC_ARG_WITH(pkgconfigdir,
+ [ --with-pkgconfigdir=DIR pkgconfig file in DIR @<:@LIBDIR/pkgconfig@:>@],
+ [pkgconfigdir=$withval],
+ [pkgconfigdir='${libdir}/pkgconfig'])
+AC_SUBST(pkgconfigdir)
+
AC_ARG_WITH(mountutildir,
[ --with-mountutildir=DIR mount helper utility in DIR @<:@/sbin@:>@],
[mountutildir=$withval],
[mountutildir='/sbin'])
AC_SUBST(mountutildir)
+AC_ARG_WITH(systemddir,
+ [ --with-systemddir=DIR systemd service files in DIR @<:@/usr/lib/systemd/system@:>@],
+ [systemddir=$withval],
+ [systemddir='/usr/lib/systemd/system'])
+AC_SUBST(systemddir)
+
AC_ARG_WITH(initdir,
[ --with-initdir=DIR init.d scripts in DIR @<:@/etc/init.d@:>@],
[initdir=$withval],
@@ -167,12 +211,35 @@ AC_ARG_WITH(launchddir,
[launchddir='/Library/LaunchDaemons'])
AC_SUBST(launchddir)
+AC_ARG_WITH([ocf],
+ [AS_HELP_STRING([--without-ocf], [build OCF-compliant cluster resource agents])],
+ ,
+ [OCF_SUBDIR='ocf'],
+ )
+AC_SUBST(OCF_SUBDIR)
+
# LEX needs a check
AC_PROG_LEX
if test "x${LEX}" != "xflex" -a "x${FLEX}" != "xlex"; then
AC_MSG_ERROR([Flex or lex required to build glusterfs.])
fi
+dnl
+dnl Word sizes...
+dnl
+AC_CHECK_SIZEOF(short)
+AC_CHECK_SIZEOF(int)
+AC_CHECK_SIZEOF(long)
+AC_CHECK_SIZEOF(long long)
+SIZEOF_SHORT=$ac_cv_sizeof_short
+SIZEOF_INT=$ac_cv_sizeof_int
+SIZEOF_LONG=$ac_cv_sizeof_long
+SIZEOF_LONG_LONG=$ac_cv_sizeof_long_long
+AC_SUBST(SIZEOF_SHORT)
+AC_SUBST(SIZEOF_INT)
+AC_SUBST(SIZEOF_LONG)
+AC_SUBST(SIZEOF_LONG_LONG)
+
# YACC needs a check
AC_PROG_YACC
if test "x${YACC}" = "xbyacc" -o "x${YACC}" = "xyacc" -o "x${YACC}" = "x"; then
@@ -181,8 +248,10 @@ fi
AC_CHECK_TOOL([LD],[ld])
+AC_CHECK_LIB([crypto], [MD5], , AC_MSG_ERROR([OpenSSL crypto library is required to build glusterfs]))
+
AC_CHECK_LIB([pthread], [pthread_mutex_init], , AC_MSG_ERROR([Posix threads library is required to build glusterfs]))
-
+
AC_CHECK_FUNC([dlopen], [has_dlopen=yes], AC_CHECK_LIB([dl], [dlopen], , AC_MSG_ERROR([Dynamic linking library required to build glusterfs])))
@@ -190,6 +259,10 @@ AC_CHECK_HEADERS([sys/xattr.h])
AC_CHECK_HEADERS([sys/extattr.h])
+AC_CHECK_HEADERS([openssl/md5.h])
+
+AC_CHECK_HEADERS([linux/falloc.h])
+
case $host_os in
darwin*)
if ! test "`/usr/bin/sw_vers | grep ProductVersion: | cut -f 2 | cut -d. -f2`" -ge 5; then
@@ -223,8 +296,8 @@ fi
# FUSE section
AC_ARG_ENABLE([fuse-client],
- AC_HELP_STRING([--disable-fuse-client],
- [Do not build the fuse client. NOTE: you cannot mount glusterfs without the client]))
+ AC_HELP_STRING([--disable-fuse-client],
+ [Do not build the fuse client. NOTE: you cannot mount glusterfs without the client]))
BUILD_FUSE_CLIENT=no
if test "x$enable_fuse_client" != "xno"; then
@@ -232,60 +305,153 @@ if test "x$enable_fuse_client" != "xno"; then
BUILD_FUSE_CLIENT="yes"
fi
+AC_ARG_ENABLE([bd-xlator],
+ AC_HELP_STRING([--enable-bd-xlator], [Build BD xlator]))
+
+if test "x$enable_bd_xlator" != "xno"; then
+ AC_CHECK_LIB([lvm2app],
+ [lvm_init,lvm_lv_from_name],
+ [HAVE_BD_LIB="yes"],
+ [HAVE_BD_LIB="no"])
+
+if test "x$HAVE_BD_LIB" = "xyes"; then
+ # lvm_lv_from_name() has been made public with lvm2-2.02.79
+ AC_CHECK_DECLS(
+ [lvm_lv_from_name],
+ [NEED_LVM_LV_FROM_NAME_DECL="no"],
+ [NEED_LVM_LV_FROM_NAME_DECL="yes"],
+ [[#include <lvm2app.h>]])
+ fi
+fi
+
+if test "x$enable_bd_xlator" = "xyes" -a "x$HAVE_BD_LIB" = "xno"; then
+ echo "BD xlator requested but required lvm2 development library not found."
+ exit 1
+fi
+
+BUILD_BD_XLATOR=no
+if test "x${enable-bd-xlator}" != "xno" -a "x${HAVE_BD_LIB}" = "xyes"; then
+ BUILD_BD_XLATOR=yes
+ AC_DEFINE(HAVE_BD_XLATOR, 1, [define if lvm2app library found and bd xlator
+ enabled])
+ if test "x$NEED_LVM_LV_FROM_NAME_DECL" = "xyes"; then
+ AC_DEFINE(NEED_LVM_LV_FROM_NAME_DECL, 1, [defined if lvm_lv_from_name()
+ was not found in the lvm2app.h header, but can be linked])
+ fi
+fi
+
+AM_CONDITIONAL([ENABLE_BD_XLATOR], [test x$BUILD_BD_XLATOR = xyes])
+
+# start encryption/crypt section
+
+AC_CHECK_HEADERS([openssl/cmac.h], [have_cmac_h=yes], [have_cmac_h=no])
+
+AC_ARG_ENABLE([crypt-xlator],
+ AC_HELP_STRING([--enable-crypt-xlator], [Build crypt encryption xlator]))
+
+if test "x$enable_crypt_xlator" = "xyes" -a "x$have_cmac_h" = "xno"; then
+ echo "Encryption xlator requires OpenSSL with cmac.h"
+ exit 1
+fi
+
+BUILD_CRYPT_XLATOR=no
+if test "x$enable_crypt_xlator" != "xno" -a "x$have_cmac_h" = "xyes"; then
+ BUILD_CRYPT_XLATOR=yes
+ AC_DEFINE(HAVE_CRYPT_XLATOR, 1, [enable building crypt encryption xlator])
+fi
+
+AM_CONDITIONAL([ENABLE_CRYPT_XLATOR], [test x$BUILD_CRYPT_XLATOR = xyes])
+
AC_SUBST(FUSE_CLIENT_SUBDIR)
# end FUSE section
# FUSERMOUNT section
AC_ARG_ENABLE([fusermount],
- AC_HELP_STRING([--enable-fusermount],
- [Build fusermount]))
+ AC_HELP_STRING([--disable-fusermount],
+ [Use system's fusermount]))
-BUILD_FUSERMOUNT="no"
-if test "x$enable_fusermount" = "xyes"; then
- FUSERMOUNT_SUBDIR="contrib/fuse-util"
- BUILD_FUSERMOUNT="yes"
+BUILD_FUSERMOUNT="yes"
+if test "x$enable_fusermount" = "xno"; then
+ BUILD_FUSERMOUNT="no"
+else
AC_DEFINE(GF_FUSERMOUNT, 1, [Use our own fusermount])
+ FUSERMOUNT_SUBDIR="contrib/fuse-util"
fi
AC_SUBST(FUSERMOUNT_SUBDIR)
#end FUSERMOUNT section
+# QEMU_BLOCK section
+
+AC_ARG_ENABLE([qemu-block],
+ AC_HELP_STRING([--enable-qemu-block],
+ [Build QEMU Block formats translator]))
+
+if test "x$enable_qemu_block" != "xno"; then
+ PKG_CHECK_MODULES([GLIB], [glib-2.0],
+ [HAVE_GLIB_2="yes"],
+ [HAVE_GLIB_2="no"])
+fi
+
+if test "x$enable_qemu_block" = "xyes" -a "x$HAVE_GLIB_2" = "xno"; then
+    echo "QEMU Block formats translator requires libglib-2.0, but it is missing."
+ exit 1
+fi
+
+BUILD_QEMU_BLOCK=no
+if test "x${enable_qemu_block}" != "xno" -a "x${HAVE_GLIB_2}" = "xyes"; then
+ BUILD_QEMU_BLOCK=yes
+ AC_DEFINE(HAVE_QEMU_BLOCK, 1, [define if libglib-2.0 library found and QEMU
+ Block translator enabled])
+fi
+
+AM_CONDITIONAL([ENABLE_QEMU_BLOCK], [test x$BUILD_QEMU_BLOCK = xyes])
+
+# end QEMU_BLOCK section
# EPOLL section
AC_ARG_ENABLE([epoll],
- AC_HELP_STRING([--disable-epoll],
- [Use poll instead of epoll.]))
+ AC_HELP_STRING([--disable-epoll],
+ [Use poll instead of epoll.]))
BUILD_EPOLL=no
if test "x$enable_epoll" != "xno"; then
AC_CHECK_HEADERS([sys/epoll.h],
[BUILD_EPOLL=yes],
- [BUILD_EPOLL=no])
+ [BUILD_EPOLL=no])
fi
# end EPOLL section
# IBVERBS section
AC_ARG_ENABLE([ibverbs],
- AC_HELP_STRING([--disable-ibverbs],
- [Do not build the ibverbs transport]))
+ AC_HELP_STRING([--disable-ibverbs],
+ [Do not build the ibverbs transport]))
if test "x$enable_ibverbs" != "xno"; then
AC_CHECK_LIB([ibverbs],
[ibv_get_device_list],
- [HAVE_LIBIBVERBS="yes"],
- [HAVE_LIBIBVERBS="no"])
+ [HAVE_LIBIBVERBS="yes"],
+ [HAVE_LIBIBVERBS="no"])
+ AC_CHECK_LIB([rdmacm], [rdma_create_id], [HAVE_RDMACM="yes"], [HAVE_RDMACM="no"])
fi
-if test "x$enable_ibverbs" = "xyes" -a "x$HAVE_LIBIBVERBS" = "xno"; then
- echo "ibverbs requested but not found."
- exit 1
+if test "x$enable_ibverbs" = "xyes"; then
+ if test "x$HAVE_LIBIBVERBS" = "xno"; then
+ echo "ibverbs-transport requested, but libibverbs is not present."
+ exit 1
+ fi
+
+ if test "x$HAVE_RDMACM" = "xno"; then
+ echo "ibverbs-transport requested, but librdmacm is not present."
+ exit 1
+ fi
fi
BUILD_RDMA=no
BUILD_IBVERBS=no
-if test "x$enable_ibverbs" != "xno" -a "x$HAVE_LIBIBVERBS" = "xyes"; then
+if test "x$enable_ibverbs" != "xno" -a "x$HAVE_LIBIBVERBS" = "xyes" -a "x$HAVE_RDMACM" = "xyes"; then
IBVERBS_SUBDIR=ib-verbs
BUILD_IBVERBS=yes
RDMA_SUBDIR=rdma
@@ -299,8 +465,8 @@ AC_SUBST(RDMA_SUBDIR)
# SYNCDAEMON section
AC_ARG_ENABLE([georeplication],
- AC_HELP_STRING([--disable-georeplication],
- [Do not install georeplication components]))
+ AC_HELP_STRING([--disable-georeplication],
+ [Do not install georeplication components]))
BUILD_SYNCDAEMON=no
case $host_os in
@@ -312,12 +478,12 @@ case $host_os in
;;
*)
#disabling geo replication for non-linux platforms
- enable_georeplication=no
+ enable_georeplication=no
;;
esac
SYNCDAEMON_COMPILE=0
if test "x$enable_georeplication" != "xno"; then
- SYNCDAEMON_SUBDIR=utils
+ SYNCDAEMON_SUBDIR=geo-replication
SYNCDAEMON_COMPILE=1
BUILD_SYNCDAEMON="yes"
@@ -341,15 +507,67 @@ AC_SUBST(SYNCDAEMON_COMPILE)
AC_SUBST(SYNCDAEMON_SUBDIR)
# end SYNCDAEMON section
-#check if libxml is present if so enable HAVE_LIB_XML
-echo -n "checking if libxml2 is present... "
+# CDC xlator - check if libz is present if so enable HAVE_LIB_Z
+echo -n "checking if libz is present... "
-PKG_CHECK_MODULES([LIBXML2], [libxml-2.0 >= 2.6.19],
- [echo "yes (features requiring libxml2 enabled)" AC_DEFINE(HAVE_LIB_XML, 1, [define if libxml2 is present])],
+PKG_CHECK_MODULES([ZLIB], [zlib >= 1.2.0],
+ [echo "yes (features requiring zlib enabled)" AC_DEFINE(HAVE_LIB_Z, 1, [define if zlib is present])],
[echo "no"] )
-AC_SUBST(LIBXML2_CFLAGS)
-AC_SUBST(LIBXML2_LIBS)
+AC_SUBST(ZLIB_CFLAGS)
+AC_SUBST(ZLIB_LIBS)
+# end CDC xlator secion
+
+# check for systemtap/dtrace
+BUILD_SYSTEMTAP=no
+AC_MSG_CHECKING([whether to include systemtap tracing support])
+AC_ARG_ENABLE([systemtap],
+ [AS_HELP_STRING([--enable-systemtap],
+ [Enable inclusion of systemtap trace support])],
+ [ENABLE_SYSTEMTAP="${enableval}"], [ENABLE_SYSTEMTAP="def"])
+
+AM_CONDITIONAL([ENABLE_SYSTEMTAP], [test "x${ENABLE_SYSTEMTAP}" = "xyes"])
+AC_MSG_RESULT(${ENABLE_SYSTEMTAP})
+
+if test "x${ENABLE_SYSTEMTAP}" != "xno"; then
+ AC_CHECK_PROG(DTRACE, dtrace, "yes", "no")
+ AC_CHECK_HEADER([sys/sdt.h], [SDT_H_FOUND="yes"],
+ [SDT_H_FOUND="no"])
+fi
+
+if test "x${ENABLE_SYSTEMTAP}" = "xyes"; then
+ if test "x${DTRACE}" = "xno"; then
+ AC_MSG_ERROR([dtrace not found])
+  elif test "x${SDT_H_FOUND}" = "xno"; then
+ AC_MSG_ERROR([systemtap support needs sys/sdt.h header])
+ fi
+fi
+
+if test "x${DTRACE}" = "xyes" -a "x${SDT_H_FOUND}" = "xyes"; then
+ AC_MSG_CHECKING([x"${DTRACE}"xy"${SDT_H_FOUND}"y])
+ AC_DEFINE([HAVE_SYSTEMTAP], [1], [Define to 1 if using probes.])
+ BUILD_SYSTEMTAP=yes
+fi
+# end of systemtap/dtrace
+
+# xml-output
+AC_ARG_ENABLE([xml-output],
+ AC_HELP_STRING([--disable-xml-output],
+ [Disable the xml output]))
+BUILD_XML_OUTPUT="yes"
+if test "x$enable_xml_output" != "xno"; then
+ #check if libxml is present if so enable HAVE_LIB_XML
+ m4_ifdef([AM_PATH_XML2],[AM_PATH_XML2([2.6.19])], [no_xml=yes])
+ if test "x${no_xml}" = "x"; then
+ AC_DEFINE([HAVE_LIB_XML], [1], [Define to 1 if using libxml2.])
+ else
+        AC_MSG_WARN([libxml2 devel libraries not found, disabling XML support])
+ BUILD_XML_OUTPUT="no"
+ fi
+else
+ BUILD_XML_OUTPUT="no"
+fi
+# end of xml-output
dnl FreeBSD > 5 has execinfo as a Ported library for giving a workaround
dnl solution to GCC backtrace functionality
@@ -374,17 +592,26 @@ dnl Linux, Solaris, Cygwin
AC_CHECK_MEMBERS([struct stat.st_atim.tv_nsec])
dnl FreeBSD, NetBSD
AC_CHECK_MEMBERS([struct stat.st_atimespec.tv_nsec])
+case $host_os in
+ *netbsd*)
+ CFLAGS+=" -D_INCOMPLETE_XOPEN_C063"
+ ;;
+esac
AC_CHECK_FUNC([linkat], [have_linkat=yes])
if test "x${have_linkat}" = "xyes"; then
AC_DEFINE(HAVE_LINKAT, 1, [define if found linkat])
fi
AC_SUBST(HAVE_LINKAT)
+dnl check for Monotonic clock
+AC_CHECK_FUNC([clock_gettime], [has_monotonic_clock=yes], AC_CHECK_LIB([rt], [clock_gettime], , AC_MSG_WARN([System doesn't have monotonic clock using contrib])))
+
dnl Check for argp
AC_CHECK_HEADER([argp.h], AC_DEFINE(HAVE_ARGP, 1, [have argp]))
AC_CONFIG_SUBDIRS(argp-standalone)
+
BUILD_ARGP_STANDALONE=no
-if test "x${ac_cv_header_argp_h}" = "xno"; then
+if test "x${ac_cv_header_argp_h}" = "xno"; then
BUILD_ARGP_STANDALONE=yes
ARGP_STANDALONE_CPPFLAGS='-I${top_srcdir}/argp-standalone'
ARGP_STANDALONE_LDADD='${top_builddir}/argp-standalone/libargp.a'
@@ -405,7 +632,18 @@ if test "x${have_fdatasync}" = "xyes"; then
AC_DEFINE(HAVE_FDATASYNC, 1, [define if fdatasync exists])
fi
-# Check the distribution where you are compiling glusterfs on
+AC_CHECK_FUNC([fallocate], [have_fallocate=yes])
+if test "x${have_fallocate}" = "xyes"; then
+ AC_DEFINE(HAVE_FALLOCATE, 1, [define if fallocate exists])
+fi
+
+AC_CHECK_FUNC([posix_fallocate], [have_posix_fallocate=yes])
+if test "x${have_posix_fallocate}" = "xyes"; then
+ AC_DEFINE(HAVE_POSIX_FALLOCATE, 1, [define if posix_fallocate exists])
+fi
+
+
+# Check the distribution where you are compiling glusterfs on
GF_DISTRIBUTION=
AC_CHECK_FILE([/etc/debian_version])
@@ -427,56 +665,115 @@ AC_SUBST(GF_DISTRIBUTION)
GF_HOST_OS=""
GF_LDFLAGS="-rdynamic"
+# check for gcc -Werror=format-security
+saved_CFLAGS=$CFLAGS
+CFLAGS="-Wformat -Werror=format-security"
+AC_MSG_CHECKING([whether $CC accepts -Werror=format-security])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], [cc_werror_format_security=yes], [cc_werror_format_security=no])
+echo $cc_werror_format_security
+if test "x$cc_werror_format_security" = "xno"; then
+ CFLAGS="$saved_CFLAGS"
+else
+ CFLAGS="$saved_CFLAGS $CFLAGS"
+fi
+
+# check for gcc -Werror=implicit-function-declaration
+saved_CFLAGS=$CFLAGS
+CFLAGS="-Werror=implicit-function-declaration"
+AC_MSG_CHECKING([whether $CC accepts -Werror=implicit-function-declaration])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], [cc_werror_implicit=yes], [cc_werror_implicit=no])
+echo $cc_werror_implicit
+if test "x$cc_werror_implicit" = "xno"; then
+ CFLAGS="$saved_CFLAGS"
+else
+ CFLAGS="$saved_CFLAGS $CFLAGS"
+fi
+
case $host_os in
linux*)
- dnl GF_LINUX_HOST_OS=1
GF_HOST_OS="GF_LINUX_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- GF_FUSE_CFLAGS="-DFUSERMOUNT_DIR=\\\"\$(bindir)\\\""
- ;;
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ GF_FUSE_CFLAGS="-DFUSERMOUNT_DIR=\\\"\$(bindir)\\\""
+ ;;
solaris*)
GF_HOST_OS="GF_SOLARIS_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_REENTRANT -D_POSIX_PTHREAD_SEMANTICS -O0 -m64"
- GF_LDFLAGS=""
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- GF_GLUSTERFS_LDFLAGS="-lnsl -lresolv -lsocket"
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_REENTRANT -D_POSIX_PTHREAD_SEMANTICS -O0 -m64"
+ GF_LDFLAGS=""
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ GF_GLUSTERFS_LIBS="-lnsl -lresolv -lsocket"
BUILD_FUSE_CLIENT=no
FUSE_CLIENT_SUBDIR=""
- ;;
+ ;;
*netbsd*)
- GF_HOST_OS="GF_BSD_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS}"
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- if test "x$ac_cv_header_execinfo_h" = "xyes"; then
- GF_GLUSTERFS_LDFLAGS="-lexecinfo"
- fi
- GF_FUSE_LDADD="-liconv -lperfuse"
- BUILD_FUSE_CLIENT=no
- ;;
+ GF_HOST_OS="GF_BSD_HOST_OS"
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_INCOMPLETE_XOPEN_C063"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ if test "x$ac_cv_header_execinfo_h" = "xyes"; then
+ GF_GLUSTERFS_LIBS="-lexecinfo"
+ fi
+ GF_FUSE_LDADD="-lperfuse"
+ BUILD_FUSE_CLIENT=yes
+ LEXLIB=""
+ ;;
*bsd*)
GF_HOST_OS="GF_BSD_HOST_OS"
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
- GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- if test "x$ac_cv_header_execinfo_h" = "xyes"; then
- GF_GLUSTERFS_LDFLAGS="-lexecinfo"
- fi
- BUILD_FUSE_CLIENT=no
- ;;
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -O0"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
+ GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ if test "x$ac_cv_header_execinfo_h" = "xyes"; then
+ GF_GLUSTERFS_LIBS="-lexecinfo"
+ fi
+ BUILD_FUSE_CLIENT=no
+ ;;
darwin*)
GF_HOST_OS="GF_DARWIN_HOST_OS"
- LIBTOOL=glibtool
- GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -bundle -undefined suppress -flat_namespace -D_XOPEN_SOURCE -O0"
- GF_GLUSTERFS_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -undefined suppress -flat_namespace -O0"
- GF_LDADD="${ARGP_STANDALONE_LDADD}"
- GF_FUSE_CFLAGS="-I\$(CONTRIBDIR)/macfuse"
- ;;
+ LIBTOOL=glibtool
+ GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -bundle -undefined suppress -flat_namespace -D_XOPEN_SOURCE -O0"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_BASENAME"
+ GF_CFLAGS="${GF_CFLAGS} -DTHREAD_UNSAFE_DIRNAME"
+ GF_GLUSTERFS_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -undefined suppress -flat_namespace -O0"
+ GF_LDADD="${ARGP_STANDALONE_LDADD}"
+ GF_FUSE_CFLAGS="-I\$(CONTRIBDIR)/macfuse"
+ ;;
esac
+# enable debug section
+AC_ARG_ENABLE([debug],
+ AC_HELP_STRING([--enable-debug],
+ [Enable debug build options.]))
+
+BUILD_DEBUG=no
+if test "x$enable_debug" = "xyes"; then
+ BUILD_DEBUG=yes
+ CFLAGS=`echo $CFLAGS | sed -e s/O2/O0/`
+else
+ BUILD_DEBUG=no
+fi
+AC_SUBST(CFLAGS)
+# end enable debug section
+
+# syslog section
+AC_ARG_ENABLE([syslog],
+ AC_HELP_STRING([--disable-syslog],
+ [Disable syslog for logging]))
+
+USE_SYSLOG="yes"
+if test "x$enable_syslog" != "xno"; then
+ AC_DEFINE(GF_USE_SYSLOG, 1, [Use syslog for logging])
+else
+ USE_SYSLOG="no"
+fi
+AM_CONDITIONAL([ENABLE_SYSLOG], [test x$USE_SYSLOG = xyes])
+#end syslog section
+
BUILD_READLINE=no
AC_CHECK_LIB([readline -lcurses],[readline],[RLLIBS="-lreadline -lcurses"])
AC_CHECK_LIB([readline -ltermcap],[readline],[RLLIBS="-lreadline -ltermcap"])
@@ -487,8 +784,66 @@ if test "x$RLLIBS" != "x"; then
BUILD_READLINE=yes
fi
+BUILD_LIBAIO=no
+AC_CHECK_LIB([aio],[io_setup],[LIBAIO="-laio"])
+
+if test "x$LIBAIO" != "x"; then
+ AC_DEFINE(HAVE_LIBAIO, 1, [libaio based POSIX enabled])
+ BUILD_LIBAIO=yes
+fi
+
+# glupy section
+BUILD_GLUPY=no
+have_python2=no
+have_Python_h=no
+
+AM_PATH_PYTHON()
+if echo $PYTHON_VERSION | grep ^2; then
+ have_python2=yes
+fi
+AC_CHECK_HEADERS([python$PYTHON_VERSION/Python.h],[have_Python_h=yes],[])
+AC_ARG_ENABLE([glupy],
+ AS_HELP_STRING([--enable-glupy],
+ [build glupy]))
+case x$enable_glupy in
+ xyes)
+ if test "x$have_python2" = "xyes" -a "x$have_Python_h" = "xyes"; then
+ BUILD_GLUPY=yes
+ else
+ AC_MSG_ERROR([glupy requires python-devel/python-dev package and python2.x])
+ fi
+ ;;
+ xno)
+ ;;
+ *)
+ if test "x$have_python2" = "xyes" -a "x$have_Python_h" = "xyes"; then
+ BUILD_GLUPY=yes
+ else
+ AC_MSG_WARN([
+ ---------------------------------------------------------------------------------
+ cannot build glupy. python 2.x and python-devel/python-dev package are required.
+ ---------------------------------------------------------------------------------])
+ fi
+ ;;
+esac
+
+if test "x$BUILD_GLUPY" = "xyes"; then
+ BUILD_PYTHON_INC=`$PYTHON -c "from distutils import sysconfig; print sysconfig.get_python_inc()"`
+ BUILD_PYTHON_LIB=python$PYTHON_VERSION
+ GLUPY_SUBDIR=glupy
+ GLUPY_SUBDIR_MAKEFILE=xlators/features/glupy/Makefile
+ GLUPY_SUBDIR_SRC_MAKEFILE=xlators/features/glupy/src/Makefile
+ echo "building glupy with -isystem $BUILD_PYTHON_INC -l $BUILD_PYTHON_LIB"
+ AC_SUBST(BUILD_PYTHON_INC)
+ AC_SUBST(BUILD_PYTHON_LIB)
+ AC_SUBST(GLUPY_SUBDIR)
+ AC_SUBST(GLUPY_SUBDIR_MAKEFILE)
+ AC_SUBST(GLUPY_SUBDIR_SRC_MAKEFILE)
+fi
+# end glupy section
+
AC_SUBST(GF_HOST_OS)
-AC_SUBST(GF_GLUSTERFS_LDFLAGS)
+AC_SUBST([GF_GLUSTERFS_LIBS])
AC_SUBST(GF_GLUSTERFS_CFLAGS)
AC_SUBST(GF_CFLAGS)
AC_SUBST(GF_LDFLAGS)
@@ -496,27 +851,41 @@ AC_SUBST(GF_LDADD)
AC_SUBST(GF_FUSE_LDADD)
AC_SUBST(GF_FUSE_CFLAGS)
AC_SUBST(RLLIBS)
+AC_SUBST(LIBAIO)
AC_SUBST(AM_MAKEFLAGS)
AC_SUBST(AM_LIBTOOLFLAGS)
CONTRIBDIR='$(top_srcdir)/contrib'
AC_SUBST(CONTRIBDIR)
-INCLUDES='-I$(top_srcdir)/libglusterfs/src -I$(CONTRIBDIR)/uuid'
-AC_SUBST(INCLUDES)
+GF_CPPDEFINES='-D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS)'
+GF_CPPINCLUDES='-I$(top_srcdir)/libglusterfs/src -I$(CONTRIBDIR)/uuid'
+GF_CPPFLAGS="$GF_CPPDEFINES $GF_CPPINCLUDES"
+AC_SUBST([GF_CPPFLAGS])
+
+AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS")
-AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS")
+AM_CONDITIONAL([GF_INSTALL_VAR_LIB_GLUSTERD], test ! -d ${localstatedir}/lib/glusterd && test -d ${sysconfdir}/glusterd )
AC_OUTPUT
echo
echo "GlusterFS configure summary"
echo "==========================="
-echo "FUSE client : $BUILD_FUSE_CLIENT"
-echo "Infiniband verbs : $BUILD_IBVERBS"
-echo "epoll IO multiplex : $BUILD_EPOLL"
-echo "argp-standalone : $BUILD_ARGP_STANDALONE"
-echo "fusermount : $BUILD_FUSERMOUNT"
-echo "readline : $BUILD_READLINE"
-echo "georeplication : $BUILD_SYNCDAEMON"
+echo "FUSE client : $BUILD_FUSE_CLIENT"
+echo "Infiniband verbs : $BUILD_IBVERBS"
+echo "epoll IO multiplex : $BUILD_EPOLL"
+echo "argp-standalone : $BUILD_ARGP_STANDALONE"
+echo "fusermount : $BUILD_FUSERMOUNT"
+echo "readline : $BUILD_READLINE"
+echo "georeplication : $BUILD_SYNCDAEMON"
+echo "Linux-AIO : $BUILD_LIBAIO"
+echo "Enable Debug : $BUILD_DEBUG"
+echo "systemtap : $BUILD_SYSTEMTAP"
+echo "Block Device xlator : $BUILD_BD_XLATOR"
+echo "glupy : $BUILD_GLUPY"
+echo "Use syslog : $USE_SYSLOG"
+echo "XML output : $BUILD_XML_OUTPUT"
+echo "QEMU Block formats : $BUILD_QEMU_BLOCK"
+echo "Encryption xlator : $BUILD_CRYPT_XLATOR"
echo
diff --git a/contrib/aclocal/mkdirp.m4 b/contrib/aclocal/mkdirp.m4
new file mode 100644
index 000000000..d2f7edd5c
--- /dev/null
+++ b/contrib/aclocal/mkdirp.m4
@@ -0,0 +1,146 @@
+# Excerpt from autoconf/autoconf/programs.m4
+# This file is part of Autoconf. -*- Autoconf -*-
+# Checking for programs.
+
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software
+# Foundation, Inc.
+
+# This file is part of Autoconf. This program is free
+# software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the Autoconf Configure Script Exception,
+# version 3.0, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License
+# and a copy of the Autoconf Configure Script Exception along with
+# this program; see the files COPYINGv3 and COPYING.EXCEPTION
+# respectively. If not, see <http://www.gnu.org/licenses/>.
+
+# Written by David MacKenzie, with help from
+# Franc,ois Pinard, Karl Berry, Richard Pixley, Ian Lance Taylor,
+# Roland McGrath, Noah Friedman, david d zuhn, and many others.
+
+# AC_PROG_MKDIR_P
+# ---------------
+# Check whether `mkdir -p' is known to be thread-safe, and fall back to
+# install-sh -d otherwise.
+#
+# Automake 1.8 used `mkdir -m 0755 -p --' to ensure that directories
+# created by `make install' are always world readable, even if the
+# installer happens to have an overly restrictive umask (e.g. 077).
+# This was a mistake. There are at least two reasons why we must not
+# use `-m 0755':
+# - it causes special bits like SGID to be ignored,
+# - it may be too restrictive (some setups expect 775 directories).
+#
+# Do not use -m 0755 and let people choose whatever they expect by
+# setting umask.
+#
+# We cannot accept any implementation of `mkdir' that recognizes `-p'.
+# Some implementations (such as Solaris 8's) are vulnerable to race conditions:
+# if a parallel make tries to run `mkdir -p a/b' and `mkdir -p a/c'
+# concurrently, both version can detect that a/ is missing, but only
+# one can create it and the other will error out. Consequently we
+# restrict ourselves to known race-free implementations.
+#
+# Automake used to define mkdir_p as `mkdir -p .', in order to
+# allow $(mkdir_p) to be used without argument. As in
+# $(mkdir_p) $(somedir)
+# where $(somedir) is conditionally defined. However we don't do
+# that for MKDIR_P.
+# 1. before we restricted the check to GNU mkdir, `mkdir -p .' was
+# reported to fail in read-only directories. The system where this
+# happened has been forgotten.
+# 2. in practice we call $(MKDIR_P) on directories such as
+# $(MKDIR_P) "$(DESTDIR)$(somedir)"
+# and we don't want to create $(DESTDIR) if $(somedir) is empty.
+# To support the latter case, we have to write
+# test -z "$(somedir)" || $(MKDIR_P) "$(DESTDIR)$(somedir)"
+# so $(MKDIR_P) always has an argument.
+# We will have better chances of detecting a missing test if
+# $(MKDIR_P) complains about missing arguments.
+# 3. $(MKDIR_P) is named after `mkdir -p' and we don't expect this
+# to accept no argument.
+# 4. having something like `mkdir .' in the output is unsightly.
+#
+# On NextStep and OpenStep, the `mkdir' command does not
+# recognize any option. It will interpret all options as
+# directories to create.
+AN_MAKEVAR([MKDIR_P], [AC_PROG_MKDIR_P])
+AC_DEFUN_ONCE([AC_PROG_MKDIR_P],
+[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+
+AC_MSG_CHECKING([for a thread-safe mkdir -p])
+if test -z "$MKDIR_P"; then
+ AC_CACHE_VAL([ac_cv_path_mkdir],
+ [_AS_PATH_WALK([$PATH$PATH_SEPARATOR/opt/sfw/bin],
+ [for ac_prog in mkdir gmkdir; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ AS_EXECUTABLE_P(["$as_dir/$ac_prog$ac_exec_ext"]) || continue
+ case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #(
+ 'mkdir (GNU coreutils) '* | \
+ 'mkdir (coreutils) '* | \
+ 'mkdir (fileutils) '4.1*)
+ ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext
+ break 3;;
+ esac
+ done
+ done])])
+ test -d ./--version && rmdir ./--version
+ if test "${ac_cv_path_mkdir+set}" = set; then
+ MKDIR_P="$ac_cv_path_mkdir -p"
+ else
+ # As a last resort, use the slow shell script. Don't cache a
+ # value for MKDIR_P within a source directory, because that will
+ # break other packages using the cache if that directory is
+ # removed, or if the value is a relative name.
+ MKDIR_P="$ac_install_sh -d"
+ fi
+fi
+dnl status.m4 does special magic for MKDIR_P instead of AC_SUBST,
+dnl to get relative names right. However, also AC_SUBST here so
+dnl that Automake versions before 1.10 will pick it up (they do not
+dnl trace AC_SUBST_TRACE).
+dnl FIXME: Remove this once we drop support for Automake < 1.10.
+AC_SUBST([MKDIR_P])dnl
+AC_MSG_RESULT([$MKDIR_P])
+])# AC_PROG_MKDIR_P
+
+
+# From automake/m4/mkdirp.m4
+## -*- Autoconf -*-
+# Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_PROG_MKDIR_P
+# ---------------
+# Check for `mkdir -p'.
+AC_DEFUN([AM_PROG_MKDIR_P],
+[
+AC_REQUIRE([AC_PROG_MKDIR_P])dnl
+dnl Automake 1.8 to 1.9.6 used to define mkdir_p. We now use MKDIR_P,
+dnl while keeping a definition of mkdir_p for backward compatibility.
+dnl @MKDIR_P@ is magic: AC_OUTPUT adjusts its value for each Makefile.
+dnl However we cannot define mkdir_p as $(MKDIR_P) for the sake of
+dnl Makefile.ins that do not define MKDIR_P, so we do our own
+dnl adjustment using top_builddir (which is defined more often than
+dnl MKDIR_P).
+AC_SUBST([mkdir_p], ["$MKDIR_P"])dnl
+case $mkdir_p in
+ [[\\/$]]* | ?:[[\\/]]*) ;;
+ */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;;
+esac
+])
diff --git a/contrib/aclocal/python.m4 b/contrib/aclocal/python.m4
new file mode 100644
index 000000000..a39a90090
--- /dev/null
+++ b/contrib/aclocal/python.m4
@@ -0,0 +1,209 @@
+## ------------------------ -*- Autoconf -*-
+## Python file handling
+## From Andrew Dalke
+## Updated by James Henstridge
+## ------------------------
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_PATH_PYTHON([MINIMUM-VERSION], [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+# ---------------------------------------------------------------------------
+# Adds support for distributing Python modules and packages. To
+# install modules, copy them to $(pythondir), using the python_PYTHON
+# automake variable. To install a package with the same name as the
+# automake package, install to $(pkgpythondir), or use the
+# pkgpython_PYTHON automake variable.
+#
+# The variables $(pyexecdir) and $(pkgpyexecdir) are provided as
+# locations to install python extension modules (shared libraries).
+# Another macro is required to find the appropriate flags to compile
+# extension modules.
+#
+# If your package is configured with a different prefix to python,
+# users will have to add the install directory to the PYTHONPATH
+# environment variable, or create a .pth file (see the python
+# documentation for details).
+#
+# If the MINIMUM-VERSION argument is passed, AM_PATH_PYTHON will
+# cause an error if the version of python installed on the system
+# doesn't meet the requirement. MINIMUM-VERSION should consist of
+# numbers and dots only.
+AC_DEFUN([AM_PATH_PYTHON],
+ [
+ dnl Find a Python interpreter. Python versions prior to 2.0 are not
+ dnl supported. (2.0 was released on October 16, 2000).
+ m4_define_default([_AM_PYTHON_INTERPRETER_LIST],
+ [python python2 python3 python3.2 python3.1 python3.0 python2.7 python2.6 python2.5 python2.4 python2.3 python2.2 dnl
+python2.1 python2.0])
+
+ m4_if([$1],[],[
+ dnl No version check is needed.
+ # Find any Python interpreter.
+ if test -z "$PYTHON"; then
+ AC_PATH_PROGS([PYTHON], _AM_PYTHON_INTERPRETER_LIST, :)
+ fi
+ am_display_PYTHON=python
+ ], [
+ dnl A version check is needed.
+ if test -n "$PYTHON"; then
+ # If the user set $PYTHON, use it and don't search something else.
+ AC_MSG_CHECKING([whether $PYTHON version >= $1])
+ AM_PYTHON_CHECK_VERSION([$PYTHON], [$1],
+ [AC_MSG_RESULT(yes)],
+ [AC_MSG_ERROR(too old)])
+ am_display_PYTHON=$PYTHON
+ else
+ # Otherwise, try each interpreter until we find one that satisfies
+ # VERSION.
+ AC_CACHE_CHECK([for a Python interpreter with version >= $1],
+ [am_cv_pathless_PYTHON],[
+ for am_cv_pathless_PYTHON in _AM_PYTHON_INTERPRETER_LIST none; do
+ test "$am_cv_pathless_PYTHON" = none && break
+ AM_PYTHON_CHECK_VERSION([$am_cv_pathless_PYTHON], [$1], [break])
+ done])
+ # Set $PYTHON to the absolute path of $am_cv_pathless_PYTHON.
+ if test "$am_cv_pathless_PYTHON" = none; then
+ PYTHON=:
+ else
+ AC_PATH_PROG([PYTHON], [$am_cv_pathless_PYTHON])
+ fi
+ am_display_PYTHON=$am_cv_pathless_PYTHON
+ fi
+ ])
+
+ if test "$PYTHON" = :; then
+ dnl Run any user-specified action, or abort.
+ m4_default([$3], [AC_MSG_ERROR([no suitable Python interpreter found])])
+ else
+
+ dnl Query Python for its version number. Getting [:3] seems to be
+ dnl the best way to do this; it's what "site.py" does in the standard
+ dnl library.
+
+ AC_CACHE_CHECK([for $am_display_PYTHON version], [am_cv_python_version],
+ [am_cv_python_version=`$PYTHON -c "import sys; sys.stdout.write(sys.version[[:3]])"`])
+ AC_SUBST([PYTHON_VERSION], [$am_cv_python_version])
+
+ dnl Use the values of $prefix and $exec_prefix for the corresponding
+ dnl values of PYTHON_PREFIX and PYTHON_EXEC_PREFIX. These are made
+ dnl distinct variables so they can be overridden if need be. However,
+ dnl general consensus is that you shouldn't need this ability.
+
+ AC_SUBST([PYTHON_PREFIX], ['${prefix}'])
+ AC_SUBST([PYTHON_EXEC_PREFIX], ['${exec_prefix}'])
+
+ dnl At times (like when building shared libraries) you may want
+ dnl to know which OS platform Python thinks this is.
+
+ AC_CACHE_CHECK([for $am_display_PYTHON platform], [am_cv_python_platform],
+ [am_cv_python_platform=`$PYTHON -c "import sys; sys.stdout.write(sys.platform)"`])
+ AC_SUBST([PYTHON_PLATFORM], [$am_cv_python_platform])
+
+
+ dnl Set up 4 directories:
+
+ dnl pythondir -- where to install python scripts. This is the
+ dnl site-packages directory, not the python standard library
+ dnl directory like in previous automake betas. This behavior
+ dnl is more consistent with lispdir.m4 for example.
+ dnl Query distutils for this directory. distutils does not exist in
+ dnl Python 1.5, so we fall back to the hardcoded directory if it
+ dnl doesn't work.
+ AC_CACHE_CHECK([for $am_display_PYTHON script directory],
+ [am_cv_python_pythondir],
+ [if test "x$prefix" = xNONE
+ then
+ am_py_prefix=$ac_default_prefix
+ else
+ am_py_prefix=$prefix
+ fi
+ am_cv_python_pythondir=`$PYTHON -c "import sys; from distutils import sysconfig; sys.stdout.write(sysconfig.get_python_lib(0,0,prefix='$am_py_prefix'))" 2>/dev/null ||
+ echo "$PYTHON_PREFIX/lib/python$PYTHON_VERSION/site-packages"`
+ case $am_cv_python_pythondir in
+ $am_py_prefix*)
+ am__strip_prefix=`echo "$am_py_prefix" | sed 's|.|.|g'`
+ am_cv_python_pythondir=`echo "$am_cv_python_pythondir" | sed "s,^$am__strip_prefix,$PYTHON_PREFIX,"`
+ ;;
+ *)
+ case $am_py_prefix in
+ /usr|/System*) ;;
+ *)
+ am_cv_python_pythondir=$PYTHON_PREFIX/lib/python$PYTHON_VERSION/site-packages
+ ;;
+ esac
+ ;;
+ esac
+ ])
+ AC_SUBST([pythondir], [$am_cv_python_pythondir])
+
+ dnl pkgpythondir -- $PACKAGE directory under pythondir. Was
+ dnl PYTHON_SITE_PACKAGE in previous betas, but this naming is
+ dnl more consistent with the rest of automake.
+
+ AC_SUBST([pkgpythondir], [\${pythondir}/$PACKAGE])
+
+ dnl pyexecdir -- directory for installing python extension modules
+ dnl (shared libraries)
+ dnl Query distutils for this directory. distutils does not exist in
+ dnl Python 1.5, so we fall back to the hardcoded directory if it
+ dnl doesn't work.
+ AC_CACHE_CHECK([for $am_display_PYTHON extension module directory],
+ [am_cv_python_pyexecdir],
+ [if test "x$exec_prefix" = xNONE
+ then
+ am_py_exec_prefix=$am_py_prefix
+ else
+ am_py_exec_prefix=$exec_prefix
+ fi
+ am_cv_python_pyexecdir=`$PYTHON -c "import sys; from distutils import sysconfig; sys.stdout.write(sysconfig.get_python_lib(1,0,prefix='$am_py_exec_prefix'))" 2>/dev/null ||
+ echo "$PYTHON_EXEC_PREFIX/lib/python$PYTHON_VERSION/site-packages"`
+ case $am_cv_python_pyexecdir in
+ $am_py_exec_prefix*)
+ am__strip_prefix=`echo "$am_py_exec_prefix" | sed 's|.|.|g'`
+ am_cv_python_pyexecdir=`echo "$am_cv_python_pyexecdir" | sed "s,^$am__strip_prefix,$PYTHON_EXEC_PREFIX,"`
+ ;;
+ *)
+ case $am_py_exec_prefix in
+ /usr|/System*) ;;
+ *)
+ am_cv_python_pyexecdir=$PYTHON_EXEC_PREFIX/lib/python$PYTHON_VERSION/site-packages
+ ;;
+ esac
+ ;;
+ esac
+ ])
+ AC_SUBST([pyexecdir], [$am_cv_python_pyexecdir])
+
+ dnl pkgpyexecdir -- $(pyexecdir)/$(PACKAGE)
+
+ AC_SUBST([pkgpyexecdir], [\${pyexecdir}/$PACKAGE])
+
+ dnl Run any user-specified action.
+ $2
+ fi
+
+])
+
+
+# AM_PYTHON_CHECK_VERSION(PROG, VERSION, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
+# ---------------------------------------------------------------------------
+# Run ACTION-IF-TRUE if the Python interpreter PROG has version >= VERSION.
+# Run ACTION-IF-FALSE otherwise.
+# This test uses sys.hexversion instead of the string equivalent (first
+# word of sys.version), in order to cope with versions such as 2.2c1.
+# This supports Python 2.0 or higher. (2.0 was released on October 16, 2000).
+AC_DEFUN([AM_PYTHON_CHECK_VERSION],
+ [prog="import sys
+# split strings by '.' and convert to numeric. Append some zeros
+# because we need at least 4 digits for the hex conversion.
+# map returns an iterator in Python 3.0 and a list in 2.x
+minver = list(map(int, '$2'.split('.'))) + [[0, 0, 0]]
+minverhex = 0
+# xrange is not present in Python 3.0 and range returns an iterator
+for i in list(range(0, 4)): minverhex = (minverhex << 8) + minver[[i]]
+sys.exit(sys.hexversion < minverhex)"
+ AS_IF([AM_RUN_LOG([$1 -c "$prog"])], [$3], [$4])])
diff --git a/contrib/fuse-include/fuse-mount.h b/contrib/fuse-include/fuse-mount.h
index 9f83faf02..9358ac810 100644
--- a/contrib/fuse-include/fuse-mount.h
+++ b/contrib/fuse-include/fuse-mount.h
@@ -8,5 +8,6 @@
*/
void gf_fuse_unmount (const char *mountpoint, int fd);
-int gf_fuse_mount (const char *mountpoint, char *fsname, char *mnt_param,
- pid_t *mtab_pid);
+int gf_fuse_mount (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param,
+ pid_t *mtab_pid, int status_fd);
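
gf_fuse_mount() gains two parameters here: the mount flags to apply and a status_fd on which the mount helper can report back. A minimal sketch of a caller adapted to the new signature; the option string and the use of 0 and -1 as "no extra flags" and "no status pipe" are assumptions made for illustration, not something the patch defines:

    /* Hypothetical caller of the signature declared above. */
    static int
    mount_volume (const char *mountpoint)
    {
            pid_t mtab_pid = 0;

            return gf_fuse_mount (mountpoint, "glusterfs",
                                  0 /* mountflags */, "allow_other",
                                  &mtab_pid, -1 /* status_fd */);
    }
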
diff --git a/contrib/fuse-include/fuse_kernel.h b/contrib/fuse-include/fuse_kernel.h
index 9ae25d6f9..60bb2f9f7 100644
--- a/contrib/fuse-include/fuse_kernel.h
+++ b/contrib/fuse-include/fuse_kernel.h
@@ -60,23 +60,75 @@
* 7.13
* - make max number of background requests and congestion threshold
* tunables
+ *
+ * 7.14
+ * - add splice support to fuse device
+ *
+ * 7.15
+ * - add store notify
+ * - add retrieve notify
+ *
+ * 7.16
+ * - add BATCH_FORGET request
+ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct
+ * fuse_ioctl_iovec' instead of ambiguous 'struct iovec'
+ * - add FUSE_IOCTL_32BIT flag
+ *
+ * 7.17
+ * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK
+ *
+ * 7.18
+ * - add FUSE_IOCTL_DIR flag
+ * - add FUSE_NOTIFY_DELETE
+ *
+ * 7.19
+ * - add FUSE_FALLOCATE
+ *
+ * 7.20
+ * - add FUSE_AUTO_INVAL_DATA
+ *
+ * 7.21
+ * - add FUSE_READDIRPLUS
+ * - send the requested events in POLL request
+ *
+ * 7.22
+ * - add FUSE_ASYNC_DIO
*/
#ifndef _LINUX_FUSE_H
#define _LINUX_FUSE_H
-#include <sys/types.h>
-#define __u64 uint64_t
-#define __s64 int64_t
-#define __u32 uint32_t
-#define __s32 int32_t
-#define __u16 uint16_t
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+/*
+ * Version negotiation:
+ *
+ * Both the kernel and userspace send the version they support in the
+ * INIT request and reply respectively.
+ *
+ * If the major versions match then both shall use the smallest
+ * of the two minor versions for communication.
+ *
+ * If the kernel supports a larger major version, then userspace shall
+ * reply with the major version it supports, ignore the rest of the
+ * INIT message and expect a new INIT message from the kernel with a
+ * matching major version.
+ *
+ * If the library supports a larger major version, then it shall fall
+ * back to the major protocol version sent by the kernel for
+ * communication and reply with that major version (and an arbitrary
+ * supported minor version).
+ */
/** Version number of this interface */
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 13
+#define FUSE_KERNEL_MINOR_VERSION 22
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
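
The negotiation rules in the comment above reduce to a simple choice: when the major versions match, both sides talk with the smaller of the two minor versions; otherwise userspace answers with its own major and waits for a fresh INIT. A minimal sketch, assuming kern_major and kern_minor were read from the kernel's INIT request:

    static unsigned int
    negotiated_minor (unsigned int kern_major, unsigned int kern_minor)
    {
            if (kern_major != FUSE_KERNEL_VERSION)
                    return 0;   /* reply with our major, expect a new INIT */

            /* majors match: use the smaller minor for communication */
            return (kern_minor < FUSE_KERNEL_MINOR_VERSION)
                    ? kern_minor : FUSE_KERNEL_MINOR_VERSION;
    }
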
@@ -85,42 +137,42 @@
userspace works under 64bit kernels */
struct fuse_attr {
- __u64 ino;
- __u64 size;
- __u64 blocks;
- __u64 atime;
- __u64 mtime;
- __u64 ctime;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 ctimensec;
- __u32 mode;
- __u32 nlink;
- __u32 uid;
- __u32 gid;
- __u32 rdev;
- __u32 blksize;
- __u32 padding;
+ uint64_t ino;
+ uint64_t size;
+ uint64_t blocks;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t nlink;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t rdev;
+ uint32_t blksize;
+ uint32_t padding;
};
struct fuse_kstatfs {
- __u64 blocks;
- __u64 bfree;
- __u64 bavail;
- __u64 files;
- __u64 ffree;
- __u32 bsize;
- __u32 namelen;
- __u32 frsize;
- __u32 padding;
- __u32 spare[6];
+ uint64_t blocks;
+ uint64_t bfree;
+ uint64_t bavail;
+ uint64_t files;
+ uint64_t ffree;
+ uint32_t bsize;
+ uint32_t namelen;
+ uint32_t frsize;
+ uint32_t padding;
+ uint32_t spare[6];
};
struct fuse_file_lock {
- __u64 start;
- __u64 end;
- __u32 type;
- __u32 pid; /* tgid */
+ uint64_t start;
+ uint64_t end;
+ uint32_t type;
+ uint32_t pid; /* tgid */
};
/**
@@ -151,8 +203,22 @@ struct fuse_file_lock {
/**
* INIT request/reply flags
*
+ * FUSE_ASYNC_READ: asynchronous read requests
+ * FUSE_POSIX_LOCKS: remote locking for POSIX file locks
+ * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported)
+ * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem
* FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
+ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB
* FUSE_DONT_MASK: don't apply umask to file mode on create operations
+ * FUSE_SPLICE_WRITE: kernel supports splice write on the device
+ * FUSE_SPLICE_MOVE: kernel supports splice move on the device
+ * FUSE_SPLICE_READ: kernel supports splice read on the device
+ * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks
+ * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories
+ * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages
+ * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one)
+ * FUSE_READDIRPLUS_AUTO: adaptive readdirplus
+ * FUSE_ASYNC_DIO: asynchronous direct I/O submission
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
@@ -161,6 +227,15 @@ struct fuse_file_lock {
#define FUSE_EXPORT_SUPPORT (1 << 4)
#define FUSE_BIG_WRITES (1 << 5)
#define FUSE_DONT_MASK (1 << 6)
+#define FUSE_SPLICE_WRITE (1 << 7)
+#define FUSE_SPLICE_MOVE (1 << 8)
+#define FUSE_SPLICE_READ (1 << 9)
+#define FUSE_FLOCK_LOCKS (1 << 10)
+#define FUSE_HAS_IOCTL_DIR (1 << 11)
+#define FUSE_AUTO_INVAL_DATA (1 << 12)
+#define FUSE_DO_READDIRPLUS (1 << 13)
+#define FUSE_READDIRPLUS_AUTO (1 << 14)
+#define FUSE_ASYNC_DIO (1 << 15)
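
Each INIT flag above is a capability bit: the kernel advertises what it can do in its INIT request and the filesystem replies with the subset it actually intends to use. A hedged sketch of that masking step; the particular bits chosen are only an example:

    /* capable: flags offered by the kernel in its INIT request.
     * Returns the bits this (hypothetical) filesystem replies with. */
    static uint32_t
    init_reply_flags (uint32_t capable)
    {
            uint32_t want = FUSE_ASYNC_READ | FUSE_BIG_WRITES |
                            FUSE_AUTO_INVAL_DATA;

            return want & capable;   /* never claim a bit the kernel did not offer */
    }
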
/**
* CUSE INIT request/reply flags
@@ -173,6 +248,7 @@ struct fuse_file_lock {
* Release flags
*/
#define FUSE_RELEASE_FLUSH (1 << 0)
+#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1)
/**
* Getattr flags
@@ -204,12 +280,16 @@ struct fuse_file_lock {
* FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
* FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
* FUSE_IOCTL_RETRY: retry with new iovecs
+ * FUSE_IOCTL_32BIT: 32bit ioctl
+ * FUSE_IOCTL_DIR: is a directory
*
* FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
*/
#define FUSE_IOCTL_COMPAT (1 << 0)
#define FUSE_IOCTL_UNRESTRICTED (1 << 1)
#define FUSE_IOCTL_RETRY (1 << 2)
+#define FUSE_IOCTL_32BIT (1 << 3)
+#define FUSE_IOCTL_DIR (1 << 4)
#define FUSE_IOCTL_MAX_IOV 256
@@ -259,6 +339,10 @@ enum fuse_opcode {
FUSE_DESTROY = 38,
FUSE_IOCTL = 39,
FUSE_POLL = 40,
+ FUSE_NOTIFY_REPLY = 41,
+ FUSE_BATCH_FORGET = 42,
+ FUSE_FALLOCATE = 43,
+ FUSE_READDIRPLUS = 44,
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -268,6 +352,9 @@ enum fuse_notify_code {
FUSE_NOTIFY_POLL = 1,
FUSE_NOTIFY_INVAL_INODE = 2,
FUSE_NOTIFY_INVAL_ENTRY = 3,
+ FUSE_NOTIFY_STORE = 4,
+ FUSE_NOTIFY_RETRIEVE = 5,
+ FUSE_NOTIFY_DELETE = 6,
FUSE_NOTIFY_CODE_MAX,
};
@@ -277,133 +364,143 @@ enum fuse_notify_code {
#define FUSE_COMPAT_ENTRY_OUT_SIZE 120
struct fuse_entry_out {
- __u64 nodeid; /* Inode ID */
- __u64 generation; /* Inode generation: nodeid:gen must
- be unique for the fs's lifetime */
- __u64 entry_valid; /* Cache timeout for the name */
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 entry_valid_nsec;
- __u32 attr_valid_nsec;
+ uint64_t nodeid; /* Inode ID */
+ uint64_t generation; /* Inode generation: nodeid:gen must
+ be unique for the fs's lifetime */
+ uint64_t entry_valid; /* Cache timeout for the name */
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t entry_valid_nsec;
+ uint32_t attr_valid_nsec;
struct fuse_attr attr;
};
struct fuse_forget_in {
- __u64 nlookup;
+ uint64_t nlookup;
+};
+
+struct fuse_forget_one {
+ uint64_t nodeid;
+ uint64_t nlookup;
+};
+
+struct fuse_batch_forget_in {
+ uint32_t count;
+ uint32_t dummy;
};
struct fuse_getattr_in {
- __u32 getattr_flags;
- __u32 dummy;
- __u64 fh;
+ uint32_t getattr_flags;
+ uint32_t dummy;
+ uint64_t fh;
};
#define FUSE_COMPAT_ATTR_OUT_SIZE 96
struct fuse_attr_out {
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 attr_valid_nsec;
- __u32 dummy;
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t attr_valid_nsec;
+ uint32_t dummy;
struct fuse_attr attr;
};
#define FUSE_COMPAT_MKNOD_IN_SIZE 8
struct fuse_mknod_in {
- __u32 mode;
- __u32 rdev;
- __u32 umask;
- __u32 padding;
+ uint32_t mode;
+ uint32_t rdev;
+ uint32_t umask;
+ uint32_t padding;
};
struct fuse_mkdir_in {
- __u32 mode;
- __u32 umask;
+ uint32_t mode;
+ uint32_t umask;
};
struct fuse_rename_in {
- __u64 newdir;
+ uint64_t newdir;
};
struct fuse_link_in {
- __u64 oldnodeid;
+ uint64_t oldnodeid;
};
struct fuse_setattr_in {
- __u32 valid;
- __u32 padding;
- __u64 fh;
- __u64 size;
- __u64 lock_owner;
- __u64 atime;
- __u64 mtime;
- __u64 unused2;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 unused3;
- __u32 mode;
- __u32 unused4;
- __u32 uid;
- __u32 gid;
- __u32 unused5;
+ uint32_t valid;
+ uint32_t padding;
+ uint64_t fh;
+ uint64_t size;
+ uint64_t lock_owner;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t unused2;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t unused3;
+ uint32_t mode;
+ uint32_t unused4;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t unused5;
};
struct fuse_open_in {
- __u32 flags;
- __u32 unused;
+ uint32_t flags;
+ uint32_t unused;
};
struct fuse_create_in {
- __u32 flags;
- __u32 mode;
- __u32 umask;
- __u32 padding;
+ uint32_t flags;
+ uint32_t mode;
+ uint32_t umask;
+ uint32_t padding;
};
struct fuse_open_out {
- __u64 fh;
- __u32 open_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t open_flags;
+ uint32_t padding;
};
struct fuse_release_in {
- __u64 fh;
- __u32 flags;
- __u32 release_flags;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t release_flags;
+ uint64_t lock_owner;
};
struct fuse_flush_in {
- __u64 fh;
- __u32 unused;
- __u32 padding;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t unused;
+ uint32_t padding;
+ uint64_t lock_owner;
};
struct fuse_read_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 read_flags;
- __u64 lock_owner;
- __u32 flags;
- __u32 padding;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t read_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
#define FUSE_COMPAT_WRITE_IN_SIZE 24
struct fuse_write_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 write_flags;
- __u64 lock_owner;
- __u32 flags;
- __u32 padding;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t write_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
struct fuse_write_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
#define FUSE_COMPAT_STATFS_SIZE 48
@@ -413,32 +510,32 @@ struct fuse_statfs_out {
};
struct fuse_fsync_in {
- __u64 fh;
- __u32 fsync_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t fsync_flags;
+ uint32_t padding;
};
struct fuse_setxattr_in {
- __u32 size;
- __u32 flags;
+ uint32_t size;
+ uint32_t flags;
};
struct fuse_getxattr_in {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_getxattr_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_lk_in {
- __u64 fh;
- __u64 owner;
+ uint64_t fh;
+ uint64_t owner;
struct fuse_file_lock lk;
- __u32 lk_flags;
- __u32 padding;
+ uint32_t lk_flags;
+ uint32_t padding;
};
struct fuse_lk_out {
@@ -446,134 +543,190 @@ struct fuse_lk_out {
};
struct fuse_access_in {
- __u32 mask;
- __u32 padding;
+ uint32_t mask;
+ uint32_t padding;
};
struct fuse_init_in {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
};
struct fuse_init_out {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
- __u16 max_background;
- __u16 congestion_threshold;
- __u32 max_write;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
+ uint16_t max_background;
+ uint16_t congestion_threshold;
+ uint32_t max_write;
};
#define CUSE_INIT_INFO_MAX 4096
struct cuse_init_in {
- __u32 major;
- __u32 minor;
- __u32 unused;
- __u32 flags;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
};
struct cuse_init_out {
- __u32 major;
- __u32 minor;
- __u32 unused;
- __u32 flags;
- __u32 max_read;
- __u32 max_write;
- __u32 dev_major; /* chardev major */
- __u32 dev_minor; /* chardev minor */
- __u32 spare[10];
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+ uint32_t max_read;
+ uint32_t max_write;
+ uint32_t dev_major; /* chardev major */
+ uint32_t dev_minor; /* chardev minor */
+ uint32_t spare[10];
};
struct fuse_interrupt_in {
- __u64 unique;
+ uint64_t unique;
};
struct fuse_bmap_in {
- __u64 block;
- __u32 blocksize;
- __u32 padding;
+ uint64_t block;
+ uint32_t blocksize;
+ uint32_t padding;
};
struct fuse_bmap_out {
- __u64 block;
+ uint64_t block;
};
struct fuse_ioctl_in {
- __u64 fh;
- __u32 flags;
- __u32 cmd;
- __u64 arg;
- __u32 in_size;
- __u32 out_size;
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t cmd;
+ uint64_t arg;
+ uint32_t in_size;
+ uint32_t out_size;
+};
+
+struct fuse_ioctl_iovec {
+ uint64_t base;
+ uint64_t len;
};
struct fuse_ioctl_out {
- __s32 result;
- __u32 flags;
- __u32 in_iovs;
- __u32 out_iovs;
+ int32_t result;
+ uint32_t flags;
+ uint32_t in_iovs;
+ uint32_t out_iovs;
};
struct fuse_poll_in {
- __u64 fh;
- __u64 kh;
- __u32 flags;
- __u32 padding;
+ uint64_t fh;
+ uint64_t kh;
+ uint32_t flags;
+ uint32_t events;
};
struct fuse_poll_out {
- __u32 revents;
- __u32 padding;
+ uint32_t revents;
+ uint32_t padding;
};
struct fuse_notify_poll_wakeup_out {
- __u64 kh;
+ uint64_t kh;
+};
+
+struct fuse_fallocate_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint64_t length;
+ uint32_t mode;
+ uint32_t padding;
};
struct fuse_in_header {
- __u32 len;
- __u32 opcode;
- __u64 unique;
- __u64 nodeid;
- __u32 uid;
- __u32 gid;
- __u32 pid;
- __u32 padding;
+ uint32_t len;
+ uint32_t opcode;
+ uint64_t unique;
+ uint64_t nodeid;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t pid;
+ uint32_t padding;
};
struct fuse_out_header {
- __u32 len;
- __s32 error;
- __u64 unique;
+ uint32_t len;
+ int32_t error;
+ uint64_t unique;
};
struct fuse_dirent {
- __u64 ino;
- __u64 off;
- __u32 namelen;
- __u32 type;
- char name[0];
+ uint64_t ino;
+ uint64_t off;
+ uint32_t namelen;
+ uint32_t type;
+ char name[];
};
#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
-#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
+#define FUSE_DIRENT_ALIGN(x) \
+ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
#define FUSE_DIRENT_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+struct fuse_direntplus {
+ struct fuse_entry_out entry_out;
+ struct fuse_dirent dirent;
+};
+
+#define FUSE_NAME_OFFSET_DIRENTPLUS \
+ offsetof(struct fuse_direntplus, dirent.name)
+#define FUSE_DIRENTPLUS_SIZE(d) \
+ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
+
struct fuse_notify_inval_inode_out {
- __u64 ino;
- __s64 off;
- __s64 len;
+ uint64_t ino;
+ int64_t off;
+ int64_t len;
};
struct fuse_notify_inval_entry_out {
- __u64 parent;
- __u32 namelen;
- __u32 padding;
+ uint64_t parent;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_delete_out {
+ uint64_t parent;
+ uint64_t child;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_store_out {
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_notify_retrieve_out {
+ uint64_t notify_unique;
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+/* Matches the size of fuse_write_in */
+struct fuse_notify_retrieve_in {
+ uint64_t dummy1;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t dummy2;
+ uint64_t dummy3;
+ uint64_t dummy4;
};
#endif /* _LINUX_FUSE_H */
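
A READDIRPLUS record is a fuse_entry_out glued to an ordinary dirent, and its
on-the-wire length is rounded up to 8 bytes just like a plain dirent. A minimal
sketch of appending one record to a reply buffer with the macros above
(illustrative; the function and its arguments are assumptions, not part of the
patch):

        #include <string.h>
        #include <stdint.h>

        /* Append one READDIRPLUS record; returns the new fill level, or `used`
           unchanged if the record does not fit in `bufsize`.  `entry` is the
           already-filled lookup result for this name. */
        static size_t
        add_direntplus (char *buf, size_t used, size_t bufsize,
                        struct fuse_entry_out *entry, uint64_t ino,
                        uint64_t off, uint32_t type, const char *name)
        {
                struct fuse_direntplus *dp =
                        (struct fuse_direntplus *) (buf + used);
                size_t namelen = strlen (name);
                size_t entlen  = FUSE_NAME_OFFSET_DIRENTPLUS + namelen;
                size_t padded  = FUSE_DIRENT_ALIGN (entlen);

                if (used + padded > bufsize)
                        return used;                  /* no room, stop here */
                dp->entry_out = *entry;
                dp->dirent.ino = ino;
                dp->dirent.off = off;
                dp->dirent.namelen = namelen;
                dp->dirent.type = type;               /* DT_REG, DT_DIR, ... */
                memcpy (dp->dirent.name, name, namelen);
                memset (buf + used + entlen, 0, padded - entlen);  /* zero padding */
                return used + padded;
        }
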
diff --git a/contrib/fuse-util/mount_util.h b/contrib/fuse-include/mount_util.h
index f392f99f1..f392f99f1 100644
--- a/contrib/fuse-util/mount_util.h
+++ b/contrib/fuse-include/mount_util.h
diff --git a/contrib/fuse-lib/misc.c b/contrib/fuse-lib/misc.c
index 28a9284bf..0c41b1a19 100644
--- a/contrib/fuse-lib/misc.c
+++ b/contrib/fuse-lib/misc.c
@@ -50,5 +50,5 @@ convert_fuse_file_lock (struct fuse_file_lock *fl, struct gf_flock *flock,
else
flock->l_len = fl->end - fl->start + 1;
flock->l_pid = fl->pid;
- flock->l_owner = lk_owner;
+ set_lk_owner_from_uint64 (&flock->l_owner, lk_owner);
}
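
The hunk above sits in the helper that translates FUSE's [start, end] byte range
into the POSIX (l_start, l_len) form, where an end of OFFSET_MAX means "until
EOF" and POSIX spells that as l_len == 0. A standalone sketch of the same
conversion using only libc types (the OFFSET_MAX sentinel follows libfuse's
convention and is an assumption here):

        #include <fcntl.h>
        #include <unistd.h>
        #include <stdint.h>

        #define FUSE_LK_OFFSET_MAX 0x7fffffffffffffffLL

        static void
        fuse_range_to_flock (uint64_t start, uint64_t end, uint32_t type,
                             struct flock *fl)
        {
                fl->l_type   = type;          /* F_RDLCK / F_WRLCK / F_UNLCK */
                fl->l_whence = SEEK_SET;
                fl->l_start  = start;
                fl->l_len    = (end == FUSE_LK_OFFSET_MAX)
                               ? 0 : end - start + 1;
        }
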
diff --git a/contrib/fuse-lib/mount-common.c b/contrib/fuse-lib/mount-common.c
new file mode 100644
index 000000000..fd6cce44e
--- /dev/null
+++ b/contrib/fuse-lib/mount-common.c
@@ -0,0 +1,265 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
+
+ This program can be distributed under the terms of the GNU LGPLv2.
+ See the file COPYING.LIB.
+*/
+
+#include "mount-gluster-compat.h"
+
+/*
+ * These functions (and gf_fuse_umount() in mount.c)
+ * were originally taken from libfuse as of commit 7960e99e
+ * (http://fuse.git.sourceforge.net/git/gitweb.cgi?p=fuse/fuse;a=commit;h=7960e99e)
+ * almost verbatim. What has been changed upon adoption:
+ * - style adopted to that of glusterfs
+ * - s/fprintf/gf_log/
+ * - s/free/FREE/, s/malloc/MALLOC/
+ * - there are some other minor things
+ *
+ * For changes that were made later and syncs with upstream,
+ * see the commit log and per-function comments.
+ */
+
+#ifndef __NetBSD__
+/* FUSE: cherry-picked bd99f9cf */
+static int
+mtab_needs_update (const char *mnt)
+{
+ int res;
+ struct stat stbuf;
+
+ /* If mtab is within new mount, don't touch it */
+ if (strncmp (mnt, _PATH_MOUNTED, strlen (mnt)) == 0 &&
+ _PATH_MOUNTED[strlen (mnt)] == '/')
+ return 0;
+
+ /*
+ * Skip mtab update if /etc/mtab:
+ *
+ * - doesn't exist,
+ * - is a symlink,
+ * - is on a read-only filesystem.
+ */
+ res = lstat (_PATH_MOUNTED, &stbuf);
+ if (res == -1) {
+ if (errno == ENOENT)
+ return 0;
+ } else {
+ uid_t ruid;
+ int err;
+
+ if (S_ISLNK (stbuf.st_mode))
+ return 0;
+
+ ruid = getuid ();
+ if (ruid != 0)
+ setreuid (0, -1);
+
+ res = access (_PATH_MOUNTED, W_OK);
+ err = (res == -1) ? errno : 0;
+ if (ruid != 0)
+ setreuid (ruid, -1);
+
+ if (err == EROFS)
+ return 0;
+ }
+
+ return 1;
+}
+#else /* __NetBSD__ */
+#define mtab_needs_update(x) 1
+#endif /* __NetBSD__ */
+
+/* FUSE: called add_mount_legacy(); R.I.P. as of cbd3a2a8 */
+int
+fuse_mnt_add_mount (const char *progname, const char *fsname,
+ const char *mnt, const char *type, const char *opts)
+{
+ int res;
+ int status;
+ sigset_t blockmask;
+ sigset_t oldmask;
+
+ if (!mtab_needs_update (mnt))
+ return 0;
+
+ sigemptyset (&blockmask);
+ sigaddset (&blockmask, SIGCHLD);
+ res = sigprocmask (SIG_BLOCK, &blockmask, &oldmask);
+ if (res == -1) {
+ GFFUSE_LOGERR ("%s: sigprocmask: %s",
+ progname, strerror (errno));
+ return -1;
+ }
+
+ res = fork ();
+ if (res == -1) {
+ GFFUSE_LOGERR ("%s: fork: %s", progname, strerror (errno));
+ goto out_restore;
+ }
+ if (res == 0) {
+ char templ[] = "/tmp/fusermountXXXXXX";
+ char *tmp;
+
+ sigprocmask (SIG_SETMASK, &oldmask, NULL);
+ setuid (geteuid ());
+
+ /*
+ * hide in a directory, where mount isn't able to resolve
+ * fsname as a valid path
+ */
+ tmp = mkdtemp (templ);
+ if (!tmp) {
+ GFFUSE_LOGERR ("%s: failed to create temporary directory",
+ progname);
+ exit (1);
+ }
+ if (chdir (tmp)) {
+ GFFUSE_LOGERR ("%s: failed to chdir to %s: %s",
+ progname, tmp, strerror (errno));
+ exit (1);
+ }
+ rmdir (tmp);
+ execl (_PATH_MOUNT, _PATH_MOUNT, "-i", "-f", "-t", type,
+ "-o", opts, fsname, mnt, NULL);
+ GFFUSE_LOGERR ("%s: failed to execute %s: %s",
+ progname, _PATH_MOUNT, strerror (errno));
+ exit (1);
+ }
+
+ res = waitpid (res, &status, 0);
+ if (res == -1)
+ GFFUSE_LOGERR ("%s: waitpid: %s", progname, strerror (errno));
+ res = (res != -1 && status == 0) ? 0 : -1;
+
+ out_restore:
+ sigprocmask (SIG_SETMASK, &oldmask, NULL);
+ return res;
+}
+
+char *
+fuse_mnt_resolve_path (const char *progname, const char *orig)
+{
+ char buf[PATH_MAX];
+ char *copy;
+ char *dst;
+ char *end;
+ char *lastcomp;
+ const char *toresolv;
+
+ if (!orig[0]) {
+ GFFUSE_LOGERR ("%s: invalid mountpoint '%s'", progname, orig);
+ return NULL;
+ }
+
+ copy = strdup (orig);
+ if (copy == NULL) {
+ GFFUSE_LOGERR ("%s: failed to allocate memory", progname);
+ return NULL;
+ }
+
+ toresolv = copy;
+ lastcomp = NULL;
+ for (end = copy + strlen (copy) - 1; end > copy && *end == '/'; end --);
+ if (end[0] != '/') {
+ char *tmp;
+ end[1] = '\0';
+ tmp = strrchr (copy, '/');
+ if (tmp == NULL) {
+ lastcomp = copy;
+ toresolv = ".";
+ } else {
+ lastcomp = tmp + 1;
+ if (tmp == copy)
+ toresolv = "/";
+ }
+ if (strcmp (lastcomp, ".") == 0 || strcmp (lastcomp, "..") == 0) {
+ lastcomp = NULL;
+ toresolv = copy;
+ }
+ else if (tmp)
+ tmp[0] = '\0';
+ }
+ if (realpath (toresolv, buf) == NULL) {
+ GFFUSE_LOGERR ("%s: bad mount point %s: %s", progname, orig,
+ strerror (errno));
+ FREE (copy);
+ return NULL;
+ }
+ if (lastcomp == NULL)
+ dst = strdup (buf);
+ else {
+ dst = (char *) MALLOC (strlen (buf) + 1 + strlen (lastcomp) + 1);
+ if (dst) {
+ unsigned buflen = strlen (buf);
+ if (buflen && buf[buflen-1] == '/')
+ sprintf (dst, "%s%s", buf, lastcomp);
+ else
+ sprintf (dst, "%s/%s", buf, lastcomp);
+ }
+ }
+ FREE (copy);
+ if (dst == NULL)
+ GFFUSE_LOGERR ("%s: failed to allocate memory", progname);
+ return dst;
+}
+
+/* FUSE: to support some changes that were reverted since
+ * then, it was split in two (fuse_mnt_umount() and
+ * exec_umount()); however the actual code is same as here
+ * since 0197ce40
+ */
+int
+fuse_mnt_umount (const char *progname, const char *abs_mnt,
+ const char *rel_mnt, int lazy)
+{
+ int res;
+ int status;
+ sigset_t blockmask;
+ sigset_t oldmask;
+
+ if (!mtab_needs_update (abs_mnt)) {
+ res = umount2 (rel_mnt, lazy ? 2 : 0);
+ if (res == -1)
+ GFFUSE_LOGERR ("%s: failed to unmount %s: %s",
+ progname, abs_mnt, strerror (errno));
+ return res;
+ }
+
+ sigemptyset (&blockmask);
+ sigaddset (&blockmask, SIGCHLD);
+ res = sigprocmask (SIG_BLOCK, &blockmask, &oldmask);
+ if (res == -1) {
+ GFFUSE_LOGERR ("%s: sigprocmask: %s", progname,
+ strerror (errno));
+ return -1;
+ }
+
+ res = fork ();
+ if (res == -1) {
+ GFFUSE_LOGERR ("%s: fork: %s", progname, strerror (errno));
+ goto out_restore;
+ }
+ if (res == 0) {
+ sigprocmask (SIG_SETMASK, &oldmask, NULL);
+ setuid (geteuid ());
+ execl ("/bin/umount", "/bin/umount", "-i", rel_mnt,
+ lazy ? "-l" : NULL, NULL);
+ GFFUSE_LOGERR ("%s: failed to execute /bin/umount: %s",
+ progname, strerror (errno));
+ exit (1);
+ }
+ res = waitpid (res, &status, 0);
+ if (res == -1)
+ GFFUSE_LOGERR ("%s: waitpid: %s", progname, strerror (errno));
+
+ if (status != 0)
+ res = -1;
+
+ out_restore:
+ sigprocmask (SIG_SETMASK, &oldmask, NULL);
+ return res;
+}
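
fuse_mnt_resolve_path() canonicalizes everything except the final path component
(so a mountpoint that does not exist yet still resolves, provided its parent
exists) and hands back a malloc'd string the caller must free. A short usage
sketch, not taken from the patch:

        /* canonicalize a user-supplied mountpoint before mounting */
        char *newmnt = fuse_mnt_resolve_path ("glusterfs-fuse", user_mnt);

        if (newmnt == NULL)
                return -1;              /* helper already logged the error */
        /* ... mount(2), fuse_mnt_add_mount (...), ... */
        FREE (newmnt);
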
diff --git a/contrib/fuse-lib/mount-gluster-compat.h b/contrib/fuse-lib/mount-gluster-compat.h
new file mode 100644
index 000000000..4fc20623b
--- /dev/null
+++ b/contrib/fuse-lib/mount-gluster-compat.h
@@ -0,0 +1,56 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
+
+ This program can be distributed under the terms of the GNU LGPLv2.
+ See the file COPYING.LIB.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stddef.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <dirent.h>
+#include <signal.h>
+#ifndef __NetBSD__
+#include <mntent.h>
+#endif /* __NetBSD__ */
+#include <sys/stat.h>
+#include <sys/poll.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <sys/mount.h>
+
+#ifdef __NetBSD__
+#include <perfuse.h>
+#define umount2(dir, flags) unmount(dir, ((flags) != 0) ? MNT_FORCE : 0)
+#define MS_RDONLY MNT_RDONLY
+#endif
+
+#ifdef linux
+#define _PATH_MOUNT "/bin/mount"
+#else /* NetBSD, MacOS X */
+#define _PATH_MOUNT "/sbin/mount"
+#endif
+
+#ifdef FUSE_UTIL
+#define MALLOC(size) malloc (size)
+#define FREE(ptr) free (ptr)
+#define GFFUSE_LOGERR(...) fprintf (stderr, ## __VA_ARGS__)
+#else /* FUSE_UTIL */
+#include "glusterfs.h"
+#include "logging.h"
+#include "common-utils.h"
+
+#define GFFUSE_LOGERR(...) \
+ gf_log ("glusterfs-fuse", GF_LOG_ERROR, ## __VA_ARGS__)
+#endif /* !FUSE_UTIL */
diff --git a/contrib/fuse-lib/mount.c b/contrib/fuse-lib/mount.c
index 85a366894..922d9e464 100644
--- a/contrib/fuse-lib/mount.c
+++ b/contrib/fuse-lib/mount.c
@@ -7,335 +7,140 @@
See the file COPYING.LIB.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stddef.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <dirent.h>
-#ifndef __NetBSD__
-#include <mntent.h>
-#endif /* __NetBSD__ */
-#include <sys/stat.h>
-#include <sys/poll.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/wait.h>
-#include <sys/mount.h>
-
-#ifdef __NetBSD__
-#include <perfuse.h>
-#define umount2(dir, flags) unmount(dir, ((flags) != 0) ? MNT_FORCE : 0)
-#endif
-
-#ifdef FUSE_UTIL
-#define MALLOC(size) malloc (size)
-#define FREE(ptr) free (ptr)
-#define GFFUSE_LOGERR(...) fprintf (stderr, ## __VA_ARGS__)
-#else /* FUSE_UTIL */
-#include "glusterfs.h"
-#include "logging.h"
-#include "common-utils.h"
+#include "mount_util.h"
+#include "mount-gluster-compat.h"
#ifdef GF_FUSERMOUNT
#define FUSERMOUNT_PROG FUSERMOUNT_DIR "/fusermount-glusterfs"
#else
#define FUSERMOUNT_PROG "fusermount"
#endif
-#define FUSE_COMMFD_ENV "_FUSE_COMMFD"
+#define FUSE_DEVFD_ENV "_FUSE_DEVFD"
-#define GFFUSE_LOGERR(...) \
- gf_log ("glusterfs-fuse", GF_LOG_ERROR, ## __VA_ARGS__)
-#endif /* !FUSE_UTIL */
-
-/*
- * Functions below, until following note, were taken from libfuse
- * (http://git.gluster.com/?p=users/csaba/fuse.git;a=commit;h=b988bbf9)
- * almost verbatim. What has been changed:
- * - style adopted to that of glusterfs
- * - s/fprintf/gf_log/
- * - s/free/FREE/, s/malloc/MALLOC/
- * - there are some other minor things
- */
-#ifndef __NetBSD__
-static int
-mtab_needs_update (const char *mnt)
+/* FUSE: function is called fuse_kern_unmount() */
+void
+gf_fuse_unmount (const char *mountpoint, int fd)
{
int res;
- struct stat stbuf;
-
- /* If mtab is within new mount, don't touch it */
- if (strncmp (mnt, _PATH_MOUNTED, strlen (mnt)) == 0 &&
- _PATH_MOUNTED[strlen (mnt)] == '/')
- return 0;
-
- /*
- * Skip mtab update if /etc/mtab:
- *
- * - doesn't exist,
- * - is a symlink,
- * - is on a read-only filesystem.
- */
- res = lstat (_PATH_MOUNTED, &stbuf);
- if (res == -1) {
- if (errno == ENOENT)
- return 0;
- } else {
- if (S_ISLNK (stbuf.st_mode))
- return 0;
-
- res = access (_PATH_MOUNTED, W_OK);
- if (res == -1 && errno == EROFS)
- return 0;
- }
+ int pid;
- return 1;
-}
-#else /* __NetBSD__ */
-#define mtab_needs_update(x) 1
-#endif /* __NetBSD__ */
+ if (!mountpoint)
+ return;
-#ifndef FUSE_UTIL
-static
-#endif
-int
-fuse_mnt_add_mount (const char *progname, const char *fsname,
- const char *mnt, const char *type, const char *opts,
- pid_t *mtab_pid)
-{
- int res;
- int status;
- sigset_t blockmask;
- sigset_t oldmask;
-
- if (!mtab_needs_update (mnt))
- return 0;
-
- sigemptyset (&blockmask);
- sigaddset (&blockmask, SIGCHLD);
- res = sigprocmask (SIG_BLOCK, &blockmask, &oldmask);
- if (res == -1) {
- GFFUSE_LOGERR ("%s: sigprocmask: %s",
- progname, strerror (errno));
- return -1;
- }
+ if (fd != -1) {
+ struct pollfd pfd;
- res = fork ();
- if (res == -1) {
- GFFUSE_LOGERR ("%s: fork: %s", progname, strerror (errno));
- goto out_restore;
- }
- if (res == 0) {
- char templ[] = "/tmp/fusermountXXXXXX";
- char *tmp;
-
- if (!mtab_pid) {
- /* mtab update done async, just log if fails */
- res = fork ();
- if (res)
- exit (res == -1 ? 1 : 0);
- res = fork ();
- if (res) {
- if (res != -1) {
- if (!(res == waitpid (res, &status, 0)
- && status == 0))
- GFFUSE_LOGERR ("%s: /etc/mtab "
- "update failed",
- progname);
- }
- exit (0);
- }
- }
+ pfd.fd = fd;
+ pfd.events = 0;
+ res = poll (&pfd, 1, 0);
+ /* If file poll returns POLLERR on the device file descriptor,
+ then the filesystem is already unmounted */
+ if (res == 1 && (pfd.revents & POLLERR))
+ return;
- sigprocmask (SIG_SETMASK, &oldmask, NULL);
- setuid (geteuid ());
-
- /*
- * hide in a directory, where mount isn't able to resolve
- * fsname as a valid path
- */
- tmp = mkdtemp (templ);
- if (!tmp) {
- GFFUSE_LOGERR ("%s: failed to create temporary directory",
- progname);
- exit (1);
- }
- if (chdir (tmp)) {
- GFFUSE_LOGERR ("%s: failed to chdir to %s: %s",
- progname, tmp, strerror (errno));
- exit (1);
- }
- rmdir (tmp);
- execl ("/bin/mount", "/bin/mount", "-i", "-f", "-t", type,
- "-o", opts, fsname, mnt, NULL);
- GFFUSE_LOGERR ("%s: failed to execute /bin/mount: %s",
- progname, strerror (errno));
- exit (1);
+ /* Need to close file descriptor, otherwise synchronous umount
+ would recurse into filesystem, and deadlock */
+ close (fd);
}
- if (mtab_pid) {
- *mtab_pid = res;
- res = 0;
- } else {
- if (!(res == waitpid (res, &status, 0) && status == 0))
- res = -1;
+
+ if (geteuid () == 0) {
+ fuse_mnt_umount ("fuse", mountpoint, mountpoint, 1);
+ return;
}
- if (res == -1)
- GFFUSE_LOGERR ("%s: waitpid: %s", progname, strerror (errno));
- out_restore:
- sigprocmask (SIG_SETMASK, &oldmask, NULL);
- return res;
-}
+ res = umount2 (mountpoint, 2);
+ if (res == 0)
+ return;
-#ifndef FUSE_UTIL
-static
-#endif
-char
-*fuse_mnt_resolve_path (const char *progname, const char *orig)
-{
- char buf[PATH_MAX];
- char *copy;
- char *dst;
- char *end;
- char *lastcomp;
- const char *toresolv;
-
- if (!orig[0]) {
- GFFUSE_LOGERR ("%s: invalid mountpoint '%s'", progname, orig);
- return NULL;
- }
+ pid = fork ();
+ if (pid == -1)
+ return;
- copy = strdup (orig);
- if (copy == NULL) {
- GFFUSE_LOGERR ("%s: failed to allocate memory", progname);
- return NULL;
- }
+ if (pid == 0) {
+ const char *argv[] = { FUSERMOUNT_PROG, "-u", "-q", "-z",
+ "--", mountpoint, NULL };
- toresolv = copy;
- lastcomp = NULL;
- for (end = copy + strlen (copy) - 1; end > copy && *end == '/'; end --);
- if (end[0] != '/') {
- char *tmp;
- end[1] = '\0';
- tmp = strrchr (copy, '/');
- if (tmp == NULL) {
- lastcomp = copy;
- toresolv = ".";
- } else {
- lastcomp = tmp + 1;
- if (tmp == copy)
- toresolv = "/";
- }
- if (strcmp (lastcomp, ".") == 0 || strcmp (lastcomp, "..") == 0) {
- lastcomp = NULL;
- toresolv = copy;
- }
- else if (tmp)
- tmp[0] = '\0';
- }
- if (realpath (toresolv, buf) == NULL) {
- GFFUSE_LOGERR ("%s: bad mount point %s: %s", progname, orig,
- strerror (errno));
- FREE (copy);
- return NULL;
- }
- if (lastcomp == NULL)
- dst = strdup (buf);
- else {
- dst = (char *) MALLOC (strlen (buf) + 1 + strlen (lastcomp) + 1);
- if (dst) {
- unsigned buflen = strlen (buf);
- if (buflen && buf[buflen-1] == '/')
- sprintf (dst, "%s%s", buf, lastcomp);
- else
- sprintf (dst, "%s/%s", buf, lastcomp);
- }
+ execvp (FUSERMOUNT_PROG, (char **)argv);
+ _exit (1);
}
- FREE (copy);
- if (dst == NULL)
- GFFUSE_LOGERR ("%s: failed to allocate memory", progname);
- return dst;
+ waitpid (pid, NULL, 0);
}
-#ifndef FUSE_UTIL
-/* return value:
- * >= 0 => fd
- * -1 => error
- */
-static int
-receive_fd (int fd)
+
+/* gluster-specific routines */
+
+static char *
+escape (char *s)
{
- struct msghdr msg;
- struct iovec iov;
- char buf[1];
- int rv;
- size_t ccmsg[CMSG_SPACE (sizeof (int)) / sizeof (size_t)];
- struct cmsghdr *cmsg;
- int *recv_fd;
-
- iov.iov_base = buf;
- iov.iov_len = 1;
-
- msg.msg_name = 0;
- msg.msg_namelen = 0;
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- /* old BSD implementations should use msg_accrights instead of
- * msg_control; the interface is different. */
- msg.msg_control = ccmsg;
- msg.msg_controllen = sizeof (ccmsg);
-
- while (((rv = recvmsg (fd, &msg, 0)) == -1) && errno == EINTR);
- if (rv == -1) {
- GFFUSE_LOGERR ("recvmsg failed: %s", strerror (errno));
- return -1;
- }
- if (!rv) {
- /* EOF */
- return -1;
+ size_t len = 0;
+ char *p = NULL;
+ char *q = NULL;
+ char *e = NULL;
+
+ for (p = s; *p; p++) {
+ if (*p == ',')
+ len++;
+ len++;
}
- cmsg = CMSG_FIRSTHDR (&msg);
- /*
- * simplify condition expression
- */
- if (cmsg->cmsg_type != SCM_RIGHTS) {
- GFFUSE_LOGERR ("got control message of unknown type %d",
- cmsg->cmsg_type);
- return -1;
+ e = CALLOC (1, len + 1);
+ if (!e)
+ return NULL;
+
+ for (p = s, q = e; *p; p++, q++) {
+ if (*p == ',') {
+ *q = '\\';
+ q++;
+ }
+ *q = *p;
}
- recv_fd = (int *) CMSG_DATA (cmsg);
- return (*recv_fd);
+ return e;
}
static int
-fuse_mount_fusermount (const char *mountpoint, const char *opts)
+fuse_mount_fusermount (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param,
+ int fd)
{
- int fds[2], pid;
- int res;
- int rv;
+ int pid = -1;
+ int res = 0;
+ int ret = -1;
+ char *fm_mnt_params = NULL;
+ char *efsname = NULL;
+
+#ifndef GF_FUSERMOUNT
+ GFFUSE_LOGERR ("Mounting via helper utility "
+ "(unprivileged mounting) is supported "
+ "only if glusterfs is compiled with "
+ "--enable-fusermount");
+ return -1;
+#endif
+
+ efsname = escape (fsname);
+ if (!efsname) {
+ GFFUSE_LOGERR ("Out of memory");
- res = socketpair (PF_UNIX, SOCK_STREAM, 0, fds);
- if (res == -1) {
- GFFUSE_LOGERR ("socketpair() failed: %s", strerror (errno));
return -1;
}
+ ret = asprintf (&fm_mnt_params,
+ "%s%s,fsname=%s,nonempty,subtype=glusterfs",
+ (mountflags & MS_RDONLY) ? "ro," : "",
+ mnt_param, efsname);
+ FREE (efsname);
+ if (ret == -1) {
+ GFFUSE_LOGERR ("Out of memory");
+
+ goto out;
+ }
+ /* fork to exec fusermount */
pid = fork ();
if (pid == -1) {
GFFUSE_LOGERR ("fork() failed: %s", strerror (errno));
- close (fds[0]);
- close (fds[1]);
- return -1;
+ ret = -1;
+ goto out;
}
if (pid == 0) {
@@ -344,214 +149,37 @@ fuse_mount_fusermount (const char *mountpoint, const char *opts)
int a = 0;
argv[a++] = FUSERMOUNT_PROG;
- if (opts) {
- argv[a++] = "-o";
- argv[a++] = opts;
- }
+ argv[a++] = "-o";
+ argv[a++] = fm_mnt_params;
argv[a++] = "--";
argv[a++] = mountpoint;
argv[a++] = NULL;
- close (fds[1]);
- fcntl (fds[0], F_SETFD, 0);
- snprintf (env, sizeof (env), "%i", fds[0]);
- setenv (FUSE_COMMFD_ENV, env, 1);
+ snprintf (env, sizeof (env), "%i", fd);
+ setenv (FUSE_DEVFD_ENV, env, 1);
execvp (FUSERMOUNT_PROG, (char **)argv);
GFFUSE_LOGERR ("failed to exec fusermount: %s",
strerror (errno));
_exit (1);
}
- close (fds[0]);
- rv = receive_fd (fds[1]);
- close (fds[1]);
- waitpid (pid, NULL, 0); /* bury zombie */
-
- return rv;
-}
-#endif
-
-#ifndef FUSE_UTIL
-static
-#endif
-int
-fuse_mnt_umount (const char *progname, const char *abs_mnt,
- const char *rel_mnt, int lazy)
-{
- int res;
- int status;
- sigset_t blockmask;
- sigset_t oldmask;
-
- if (!mtab_needs_update (abs_mnt)) {
- res = umount2 (rel_mnt, lazy ? 2 : 0);
- if (res == -1)
- GFFUSE_LOGERR ("%s: failed to unmount %s: %s",
- progname, abs_mnt, strerror (errno));
- return res;
- }
-
- sigemptyset (&blockmask);
- sigaddset (&blockmask, SIGCHLD);
- res = sigprocmask (SIG_BLOCK, &blockmask, &oldmask);
- if (res == -1) {
- GFFUSE_LOGERR ("%s: sigprocmask: %s", progname,
- strerror (errno));
- return -1;
- }
-
- res = fork ();
- if (res == -1) {
- GFFUSE_LOGERR ("%s: fork: %s", progname, strerror (errno));
- goto out_restore;
- }
- if (res == 0) {
- sigprocmask (SIG_SETMASK, &oldmask, NULL);
- setuid (geteuid ());
- execl ("/bin/umount", "/bin/umount", "-i", rel_mnt,
- lazy ? "-l" : NULL, NULL);
- GFFUSE_LOGERR ("%s: failed to execute /bin/umount: %s",
- progname, strerror (errno));
- exit (1);
- }
- res = waitpid (res, &status, 0);
- if (res == -1)
- GFFUSE_LOGERR ("%s: waitpid: %s", progname, strerror (errno));
-
- if (status != 0)
- res = -1;
-
- out_restore:
- sigprocmask (SIG_SETMASK, &oldmask, NULL);
- return res;
-}
-
-#ifdef FUSE_UTIL
-int
-fuse_mnt_check_empty (const char *progname, const char *mnt,
- mode_t rootmode, off_t rootsize)
-{
- int isempty = 1;
-
- if (S_ISDIR (rootmode)) {
- struct dirent *ent;
- DIR *dp = opendir (mnt);
- if (dp == NULL) {
- fprintf (stderr,
- "%s: failed to open mountpoint for reading: %s\n",
- progname, strerror (errno));
- return -1;
- }
- while ((ent = readdir (dp)) != NULL) {
- if (strcmp (ent->d_name, ".") != 0 &&
- strcmp (ent->d_name, "..") != 0) {
- isempty = 0;
- break;
- }
- }
- closedir (dp);
- } else if (rootsize)
- isempty = 0;
-
- if (!isempty) {
- fprintf (stderr, "%s: mountpoint is not empty\n", progname);
- fprintf (stderr, "%s: if you are sure this is safe, "
- "use the 'nonempty' mount option\n", progname);
- return -1;
- }
- return 0;
-}
-
-int
-fuse_mnt_check_fuseblk (void)
-{
- char buf[256];
- FILE *f = fopen ("/proc/filesystems", "r");
- if (!f)
- return 1;
-
- while (fgets (buf, sizeof (buf), f))
- if (strstr (buf, "fuseblk\n")) {
- fclose (f);
- return 1;
- }
-
- fclose (f);
- return 0;
-}
-#endif
-
-#ifndef FUSE_UTIL
-void
-gf_fuse_unmount (const char *mountpoint, int fd)
-{
- int res;
- int pid;
-
- if (!mountpoint)
- return;
-
- if (fd != -1) {
- struct pollfd pfd;
-
- pfd.fd = fd;
- pfd.events = 0;
- res = poll (&pfd, 1, 0);
- /* If file poll returns POLLERR on the device file descriptor,
- then the filesystem is already unmounted */
- if (res == 1 && (pfd.revents & POLLERR))
- return;
-
- /* Need to close file descriptor, otherwise synchronous umount
- would recurse into filesystem, and deadlock */
- close (fd);
- }
-
- if (geteuid () == 0) {
- fuse_mnt_umount ("fuse", mountpoint, mountpoint, 1);
- return;
- }
-
- res = umount2 (mountpoint, 2);
- if (res == 0)
- return;
-
- pid = fork ();
- if (pid == -1)
- return;
-
- if (pid == 0) {
- const char *argv[] = { FUSERMOUNT_PROG, "-u", "-q", "-z",
- "--", mountpoint, NULL };
-
- execvp (FUSERMOUNT_PROG, (char **)argv);
- _exit (1);
- }
- waitpid (pid, NULL, 0);
+ ret = waitpid (pid, &res, 0);
+ ret = (ret == pid && res == 0) ? 0 : -1;
+ out:
+ FREE (fm_mnt_params);
+ return ret;
}
-#endif
-/*
- * Functions below are loosely modelled after similar functions of libfuse
- */
-
-#ifndef FUSE_UTIL
static int
-fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, pid_t *mtab_pid)
+fuse_mount_sys (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param, int fd)
{
- int fd = -1, ret = -1;
+ int ret = -1;
unsigned mounted = 0;
char *mnt_param_mnt = NULL;
char *fstype = "fuse.glusterfs";
char *source = fsname;
- fd = open ("/dev/fuse", O_RDWR);
- if (fd == -1) {
- GFFUSE_LOGERR ("cannot open /dev/fuse (%s)", strerror (errno));
-
- return -1;
- }
-
ret = asprintf (&mnt_param_mnt,
"%s,fd=%i,rootmode=%o,user_id=%i,group_id=%i",
mnt_param, fd, S_IFDIR, getuid (), getgid ());
@@ -560,7 +188,7 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, pid_t *mt
goto out;
}
- ret = mount (source, mountpoint, fstype, 0,
+ ret = mount (source, mountpoint, fstype, mountflags,
mnt_param_mnt);
if (ret == -1 && errno == ENODEV) {
/* fs subtype support was added by 79c0b2df aka
@@ -573,7 +201,7 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, pid_t *mt
goto out;
}
- ret = mount (source, mountpoint, fstype, 0,
+ ret = mount (source, mountpoint, fstype, mountflags,
mnt_param_mnt);
}
if (ret == -1)
@@ -581,8 +209,10 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, pid_t *mt
else
mounted = 1;
+#ifndef __NetBSD__
if (geteuid () == 0) {
char *newmnt = fuse_mnt_resolve_path ("fuse", mountpoint);
+ char *mnt_param_mtab = NULL;
if (!newmnt) {
ret = -1;
@@ -590,8 +220,17 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, pid_t *mt
goto out;
}
- ret = fuse_mnt_add_mount ("fuse", source, newmnt, fstype,
- mnt_param, mtab_pid);
+ ret = asprintf (&mnt_param_mtab, "%s%s",
+ mountflags & MS_RDONLY ? "ro," : "",
+ mnt_param);
+ if (ret == -1)
+ GFFUSE_LOGERR ("Out of memory");
+ else {
+ ret = fuse_mnt_add_mount ("fuse", source, newmnt,
+ fstype, mnt_param_mtab);
+ FREE (mnt_param_mtab);
+ }
+
FREE (newmnt);
if (ret == -1) {
GFFUSE_LOGERR ("failed to add mtab entry");
@@ -599,95 +238,76 @@ fuse_mount_sys (const char *mountpoint, char *fsname, char *mnt_param, pid_t *mt
goto out;
}
}
+#endif /* __NetBSD__ */
- out:
+out:
if (ret == -1) {
if (mounted)
umount2 (mountpoint, 2); /* lazy umount */
- close (fd);
- fd = -1;
}
FREE (mnt_param_mnt);
if (source != fsname)
FREE (source);
- return fd;
-}
-
-static char *
-escape (char *s)
-{
- size_t len = 0;
- char *p = NULL;
- char *q = NULL;
- char *e = NULL;
- for (p = s; *p; p++) {
- if (*p == ',')
- len++;
- len++;
- }
-
- e = CALLOC (1, len + 1);
- if (!e)
- return NULL;
-
- for (p = s, q = e; *p; p++, q++) {
- if (*p == ',') {
- *q = '\\';
- q++;
- }
- *q = *p;
- }
-
- return e;
+ return ret;
}
int
-gf_fuse_mount (const char *mountpoint, char *fsname, char *mnt_param,
- pid_t *mtab_pid)
+gf_fuse_mount (const char *mountpoint, char *fsname,
+ unsigned long mountflags, char *mnt_param,
+ pid_t *mnt_pid, int status_fd)
{
- int fd = -1, rv = -1;
- char *fm_mnt_params = NULL, *p = NULL;
- char *efsname = NULL;
+ int fd = -1;
+ pid_t pid = -1;
+ int ret = -1;
- fd = fuse_mount_sys (mountpoint, fsname, mnt_param, mtab_pid);
+ fd = open ("/dev/fuse", O_RDWR);
if (fd == -1) {
- gf_log ("glusterfs-fuse", GF_LOG_INFO,
- "direct mount failed (%s), "
- "retry to mount via fusermount",
- strerror (errno));
-
- efsname = escape (fsname);
- if (!efsname) {
- GFFUSE_LOGERR ("Out of memory");
-
- return -1;
- }
- rv = asprintf (&fm_mnt_params,
- "%s,fsname=%s,nonempty,subtype=glusterfs",
- mnt_param, efsname);
- FREE (efsname);
- if (rv == -1) {
- GFFUSE_LOGERR ("Out of memory");
+ GFFUSE_LOGERR ("cannot open /dev/fuse (%s)",
+ strerror (errno));
+ return -1;
+ }
- return -1;
+ /* start mount agent */
+ pid = fork();
+ switch (pid) {
+ case 0:
+ /* hello it's mount agent */
+ if (!mnt_pid) {
+ /* daemonize mount agent, caller is
+ * not interested in waiting for it
+ */
+ pid = fork ();
+ if (pid)
+ exit (pid == -1 ? 1 : 0);
}
- fd = fuse_mount_fusermount (mountpoint, fm_mnt_params);
- if (fd == -1) {
- p = fm_mnt_params + strlen (fm_mnt_params);
- while (*--p != ',');
- *p = '\0';
+ ret = fuse_mount_sys (mountpoint, fsname, mountflags, mnt_param, fd);
+ if (ret == -1) {
+ gf_log ("glusterfs-fuse", GF_LOG_INFO,
+ "direct mount failed (%s), "
+ "retry to mount via fusermount",
+ strerror (errno));
- fd = fuse_mount_fusermount (mountpoint, fm_mnt_params);
+ ret = fuse_mount_fusermount (mountpoint, fsname, mountflags,
+ mnt_param, fd);
}
- FREE (fm_mnt_params);
+ if (ret == -1)
+ GFFUSE_LOGERR ("mount of %s to %s (%s) failed",
+ fsname, mountpoint, mnt_param);
- if (fd == -1)
- GFFUSE_LOGERR ("mount failed");
+ if (status_fd >= 0)
+ (void)write (status_fd, &ret, sizeof (ret));
+ exit (!!ret);
+ /* bye mount agent */
+ case -1:
+ close (fd);
+ fd = -1;
}
+ if (mnt_pid)
+ *mnt_pid = pid;
+
return fd;
}
-#endif
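
After this rewrite gf_fuse_mount() opens /dev/fuse itself, forks a "mount agent"
to do the actual mounting, and reports the agent's result through status_fd. A
sketch of a caller driving the new interface (illustrative only; it assumes a
pipe is used for the status channel and that the mount flags shown are just an
example):

        #include <unistd.h>
        #include <sys/mount.h>

        static int
        start_fuse (const char *mountpoint, char *fsname, char *mnt_param)
        {
                int   pfd[2], fd, status = -1;
                pid_t mnt_pid = -1;

                if (pipe (pfd) == -1)
                        return -1;
                fd = gf_fuse_mount (mountpoint, fsname, MS_NOSUID | MS_NODEV,
                                    mnt_param, &mnt_pid, pfd[1]);
                close (pfd[1]);
                if (fd == -1) {
                        close (pfd[0]);
                        return -1;
                }
                if (read (pfd[0], &status, sizeof (status)) != sizeof (status) ||
                    status != 0) {
                        close (pfd[0]);
                        close (fd);     /* mount agent reported failure */
                        return -1;
                }
                close (pfd[0]);
                return fd;              /* serve FUSE requests on this fd */
        }
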
diff --git a/contrib/fuse-util/Makefile.am b/contrib/fuse-util/Makefile.am
index 42609a688..971d3d220 100644
--- a/contrib/fuse-util/Makefile.am
+++ b/contrib/fuse-util/Makefile.am
@@ -1,9 +1,11 @@
bin_PROGRAMS = fusermount-glusterfs
-fusermount_glusterfs_SOURCES = fusermount.c $(CONTRIBDIR)/fuse-lib/mount.c
-noinst_HEADERS = mount_util.h
+fusermount_glusterfs_SOURCES = fusermount.c mount_util.c $(CONTRIBDIR)/fuse-lib/mount-common.c
+noinst_HEADERS = $(CONTRIBDIR)/fuse-include/mount_util.h
-AM_CFLAGS = -Wall -D_FILE_OFFSET_BITS=64 -DFUSE_UTIL $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -DFUSE_UTIL -I$(CONTRIBDIR)/fuse-include
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
install-exec-hook:
-chown root $(DESTDIR)$(bindir)/fusermount-glusterfs
diff --git a/contrib/fuse-util/fusermount.c b/contrib/fuse-util/fusermount.c
index 39da9b6a0..0ff8d9039 100644
--- a/contrib/fuse-util/fusermount.c
+++ b/contrib/fuse-util/fusermount.c
@@ -19,6 +19,7 @@
#include <errno.h>
#include <fcntl.h>
#include <pwd.h>
+#include <limits.h>
#include <mntent.h>
#include <sys/wait.h>
#include <sys/stat.h>
@@ -28,6 +29,7 @@
#include <sys/utsname.h>
#include <sched.h>
+#define FUSE_DEVFD_ENV "_FUSE_DEVFD"
#define FUSE_COMMFD_ENV "_FUSE_COMMFD"
#define FUSE_DEV_OLD "/proc/fs/fuse/dev"
@@ -114,8 +116,16 @@ static int lock_umount(void)
static void unlock_umount(int mtablock)
{
- lockf(mtablock, F_ULOCK, 0);
- close(mtablock);
+ if (mtablock >= 0) {
+ int res;
+
+ res = lockf(mtablock, F_ULOCK, 0);
+ if (res < 0) {
+ fprintf(stderr, "%s: error releasing lock: %s\n",
+ progname, strerror(errno));
+ }
+ close(mtablock);
+ }
}
static int add_mount(const char *source, const char *mnt, const char *type,
@@ -238,7 +248,7 @@ static int check_is_mount_child(void *p)
}
count = 0;
- while ((entp = getmntent(fp)) != NULL)
+ while (getmntent(fp) != NULL)
count++;
endmntent(fp);
@@ -325,7 +335,7 @@ static int check_is_mount(const char *last, const char *mnt)
return 0;
}
-static int chdir_to_parent(char *copy, const char **lastp, int *currdir_fd)
+static int chdir_to_parent(char *copy, const char **lastp)
{
char *tmp;
const char *parent;
@@ -350,14 +360,6 @@ static int chdir_to_parent(char *copy, const char **lastp, int *currdir_fd)
parent = "/";
}
- *currdir_fd = open(".", O_RDONLY);
- if (*currdir_fd == -1) {
- fprintf(stderr,
- "%s: failed to open current directory: %s\n",
- progname, strerror(errno));
- return -1;
- }
-
res = chdir(parent);
if (res == -1) {
fprintf(stderr, "%s: failed to chdir to %s: %s\n",
@@ -382,7 +384,6 @@ static int chdir_to_parent(char *copy, const char **lastp, int *currdir_fd)
static int unmount_fuse_locked(const char *mnt, int quiet, int lazy)
{
- int currdir_fd = -1;
char *copy;
const char *last;
int res;
@@ -399,7 +400,7 @@ static int unmount_fuse_locked(const char *mnt, int quiet, int lazy)
return -1;
}
- res = chdir_to_parent(copy, &last, &currdir_fd);
+ res = chdir_to_parent(copy, &last);
if (res == -1)
goto out;
@@ -411,10 +412,6 @@ static int unmount_fuse_locked(const char *mnt, int quiet, int lazy)
out:
free(copy);
- if (currdir_fd != -1) {
- fchdir(currdir_fd);
- close(currdir_fd);
- }
return res;
}
@@ -508,13 +505,13 @@ static void read_conf(void)
int isnewline = 1;
while (fgets(line, sizeof(line), fp) != NULL) {
if (isnewline) {
- if (line[strlen(line)-1] == '\n') {
+ if (strlen(line) && line[strlen(line)-1] == '\n') {
strip_line(line);
parse_line(line, linenum);
} else {
isnewline = 0;
}
- } else if(line[strlen(line)-1] == '\n') {
+ } else if(strlen(line) && line[strlen(line)-1] == '\n') {
fprintf(stderr, "%s: reading %s: line %i too long\n", progname, FUSE_CONF, linenum);
isnewline = 1;
@@ -609,7 +606,7 @@ static int add_option(char **optsp, const char *opt, unsigned expand)
static int get_mnt_opts(int flags, char *opts, char **mnt_optsp)
{
int i;
- int l;
+ size_t l;
if (!(flags & MS_RDONLY) && add_option(mnt_optsp, "rw", 0) == -1)
return -1;
@@ -624,7 +621,7 @@ static int get_mnt_opts(int flags, char *opts, char **mnt_optsp)
return -1;
/* remove comma from end of opts*/
l = strlen(*mnt_optsp);
- if ((*mnt_optsp)[l-1] == ',')
+ if (l && (*mnt_optsp)[l-1] == ',')
(*mnt_optsp)[l-1] = '\0';
if (getuid() != 0) {
const char *user = get_user_name();
@@ -653,8 +650,7 @@ static int get_string_opt(const char *s, unsigned len, const char *opt,
unsigned opt_len = strlen(opt);
char *d;
- if (*val)
- free(*val);
+ free(*val);
*val = (char *) malloc(len - opt_len + 1);
if (!*val) {
fprintf(stderr, "%s: failed to allocate memory\n", progname);
@@ -823,15 +819,14 @@ static int do_mount(const char *mnt, char **typep, mode_t rootmode,
fprintf(stderr, "%s: mount failed: %s\n", progname,
strerror(errno_save));
goto err;
- } else {
- *sourcep = source;
- *typep = type;
- *mnt_optsp = mnt_opts;
}
+ *sourcep = source;
+ *typep = type;
+ *mnt_optsp = mnt_opts;
free(fsname);
free(optbuf);
- return res;
+ return 0;
err:
free(fsname);
@@ -874,8 +869,7 @@ static int check_version(const char *dev)
return 0;
}
-static int check_perm(const char **mntp, struct stat *stbuf, int *currdir_fd,
- int *mountpoint_fd)
+static int check_perm(const char **mntp, struct stat *stbuf, int *mountpoint_fd)
{
int res;
const char *mnt = *mntp;
@@ -893,13 +887,6 @@ static int check_perm(const char **mntp, struct stat *stbuf, int *currdir_fd,
return 0;
if (S_ISDIR(stbuf->st_mode)) {
- *currdir_fd = open(".", O_RDONLY);
- if (*currdir_fd == -1) {
- fprintf(stderr,
- "%s: failed to open current directory: %s\n",
- progname, strerror(errno));
- return -1;
- }
res = chdir(mnt);
if (res == -1) {
fprintf(stderr,
@@ -1016,8 +1003,36 @@ static int open_fuse_device(char **devp)
return -1;
}
+static int check_fuse_device(char *devfd, char **devp)
+{
+ int res;
+ char *devlink;
-static int mount_fuse(const char *mnt, const char *opts)
+ res = asprintf(&devlink, "/proc/self/fd/%s", devfd);
+ if (res == -1) {
+ fprintf(stderr, "%s: failed to allocate memory\n", progname);
+ return -1;
+ }
+
+ *devp = (char *) calloc(1, PATH_MAX + 1);
+ if (!*devp) {
+ fprintf(stderr, "%s: failed to allocate memory\n", progname);
+ free(devlink);
+ return -1;
+ }
+
+ res = readlink (devlink, *devp, PATH_MAX);
+ free (devlink);
+ if (res == -1) {
+ fprintf(stderr, "%s: specified fuse fd is invalid\n",
+ progname);
+ return -1;
+ }
+
+ return atoi(devfd);
+}
+
+static int mount_fuse(const char *mnt, const char *opts, char *devfd)
{
int res;
int fd;
@@ -1027,10 +1042,9 @@ static int mount_fuse(const char *mnt, const char *opts)
char *source = NULL;
char *mnt_opts = NULL;
const char *real_mnt = mnt;
- int currdir_fd = -1;
int mountpoint_fd = -1;
- fd = open_fuse_device(&dev);
+ fd = devfd ? check_fuse_device(devfd, &dev) : open_fuse_device(&dev);
if (fd == -1)
return -1;
@@ -1041,15 +1055,13 @@ static int mount_fuse(const char *mnt, const char *opts)
int mount_count = count_fuse_fs();
if (mount_count >= mount_max) {
fprintf(stderr, "%s: too many FUSE filesystems mounted; mount_max=N can be set in /etc/fuse.conf\n", progname);
- close(fd);
- return -1;
+ goto fail_close_fd;
}
}
res = check_version(dev);
if (res != -1) {
- res = check_perm(&real_mnt, &stbuf, &currdir_fd,
- &mountpoint_fd);
+ res = check_perm(&real_mnt, &stbuf, &mountpoint_fd);
restore_privs();
if (res != -1)
res = do_mount(real_mnt, &type, stbuf.st_mode & S_IFMT,
@@ -1058,33 +1070,38 @@ static int mount_fuse(const char *mnt, const char *opts)
} else
restore_privs();
- if (currdir_fd != -1) {
- fchdir(currdir_fd);
- close(currdir_fd);
- }
if (mountpoint_fd != -1)
close(mountpoint_fd);
+ if (res == -1)
+ goto fail_close_fd;
+
+ res = chdir("/");
if (res == -1) {
- close(fd);
- return -1;
+ fprintf(stderr, "%s: failed to chdir to '/'\n", progname);
+ goto fail_close_fd;
}
if (geteuid() == 0) {
res = add_mount(source, mnt, type, mnt_opts);
if (res == -1) {
- umount2(mnt, 2); /* lazy umount */
- close(fd);
- return -1;
+ /* Can't clean up mount in a non-racy way */
+ goto fail_close_fd;
}
}
+out_free:
free(source);
free(type);
free(mnt_opts);
free(dev);
return fd;
+
+fail_close_fd:
+ close(fd);
+ fd = -1;
+ goto out_free;
}
static int send_fd(int sock_fd, int fd)
@@ -1154,6 +1171,7 @@ int main(int argc, char *argv[])
static int unmount = 0;
static int lazy = 0;
static int quiet = 0;
+ char *devfd;
char *commfd;
int cfd;
const char *opts = "";
@@ -1222,6 +1240,13 @@ int main(int argc, char *argv[])
drop_privs();
mnt = fuse_mnt_resolve_path(progname, origmnt);
+ if (mnt != NULL) {
+ res = chdir("/");
+ if (res == -1) {
+ fprintf(stderr, "%s: failed to chdir to '/'\n", progname);
+ exit(1);
+ }
+ }
restore_privs();
if (mnt == NULL)
exit(1);
@@ -1242,21 +1267,26 @@ int main(int argc, char *argv[])
return 0;
}
- commfd = getenv(FUSE_COMMFD_ENV);
- if (commfd == NULL) {
- fprintf(stderr, "%s: old style mounting not supported\n",
- progname);
- exit(1);
+ devfd = getenv(FUSE_DEVFD_ENV);
+ if (devfd == NULL) {
+ commfd = getenv(FUSE_COMMFD_ENV);
+ if (commfd == NULL) {
+ fprintf(stderr, "%s: old style mounting not supported\n",
+ progname);
+ exit(1);
+ }
}
- fd = mount_fuse(mnt, opts);
+ fd = mount_fuse(mnt, opts, devfd);
if (fd == -1)
exit(1);
- cfd = atoi(commfd);
- res = send_fd(cfd, fd);
- if (res == -1)
- exit(1);
+ if (devfd == NULL) {
+ cfd = atoi(commfd);
+ res = send_fd(cfd, fd);
+ if (res == -1)
+ exit(1);
+ }
return 0;
}
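
The new _FUSE_DEVFD path hands fusermount an already-open /dev/fuse descriptor
instead of passing one back over a socket pair (_FUSE_COMMFD), and
check_fuse_device() sanity-checks it by resolving the /proc/self/fd symlink. A
self-contained sketch of that validation idea (not the patch's exact code):

        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>
        #include <limits.h>

        /* Return the numeric fd if the inherited descriptor still points at a
           real file (readlink on /proc/self/fd/N succeeds), or -1 otherwise. */
        static int
        inherited_fd_ok (const char *devfd_str)
        {
                char link[64];
                char target[PATH_MAX + 1] = "";

                snprintf (link, sizeof (link), "/proc/self/fd/%s", devfd_str);
                if (readlink (link, target, PATH_MAX) == -1)
                        return -1;
                return atoi (devfd_str);
        }
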
diff --git a/contrib/fuse-util/mount_util.c b/contrib/fuse-util/mount_util.c
new file mode 100644
index 000000000..911b84445
--- /dev/null
+++ b/contrib/fuse-util/mount_util.c
@@ -0,0 +1,64 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU LGPLv2.
+ See the file COPYING.LIB.
+*/
+
+#include <dirent.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+int fuse_mnt_check_empty(const char *progname, const char *mnt,
+ mode_t rootmode, off_t rootsize)
+{
+ int isempty = 1;
+
+ if (S_ISDIR(rootmode)) {
+ struct dirent *ent;
+ DIR *dp = opendir(mnt);
+ if (dp == NULL) {
+ fprintf(stderr,
+ "%s: failed to open mountpoint for reading: %s\n",
+ progname, strerror(errno));
+ return -1;
+ }
+ while ((ent = readdir(dp)) != NULL) {
+ if (strcmp(ent->d_name, ".") != 0 &&
+ strcmp(ent->d_name, "..") != 0) {
+ isempty = 0;
+ break;
+ }
+ }
+ closedir(dp);
+ } else if (rootsize)
+ isempty = 0;
+
+ if (!isempty) {
+ fprintf(stderr, "%s: mountpoint is not empty\n", progname);
+ fprintf(stderr, "%s: if you are sure this is safe, use the 'nonempty' mount option\n", progname);
+ return -1;
+ }
+ return 0;
+}
+
+int fuse_mnt_check_fuseblk(void)
+{
+ char buf[256];
+ FILE *f = fopen("/proc/filesystems", "r");
+ if (!f)
+ return 1;
+
+ while (fgets(buf, sizeof(buf), f))
+ if (strstr(buf, "fuseblk\n")) {
+ fclose(f);
+ return 1;
+ }
+
+ fclose(f);
+ return 0;
+}
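
fuse_mnt_check_fuseblk() scans /proc/filesystems for the "fuseblk" type and
deliberately returns 1 when that file cannot be opened, so a mount attempt is
still made in the absence of procfs. A tiny usage sketch (illustrative, not from
the patch):

        if (strcmp (type, "fuseblk") == 0 && !fuse_mnt_check_fuseblk ()) {
                fprintf (stderr, "%s: 'fuseblk' is not supported by this kernel\n",
                         progname);
                return -1;
        }
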
diff --git a/contrib/libgen/basename_r.c b/contrib/libgen/basename_r.c
new file mode 100644
index 000000000..2c3a87afe
--- /dev/null
+++ b/contrib/libgen/basename_r.c
@@ -0,0 +1,40 @@
+/*
+ * borrowed from glibc-2.12.1/string/basename.c
+ * Modified to return "." for NULL or "", as required for SUSv2.
+ */
+#include <string.h>
+#include <stdlib.h>
+#ifdef THREAD_UNSAFE_BASENAME
+
+/* Return the name-within-directory of a file name.
+ Copyright (C) 1996,97,98,2002 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+char *
+basename_r (filename)
+ const char *filename;
+{
+ char *p;
+
+ if ((filename == NULL) || (*filename == '\0'))
+ return ".";
+
+ p = strrchr (filename, '/');
+ return p ? p + 1 : (char *) filename;
+}
+#endif /* THREAD_UNSAFE_BASENAME */
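
Unlike POSIX basename(3), this bundled basename_r never modifies its argument and
maps NULL or "" to ".", so it is safe to call on shared, read-only strings. A
quick usage sketch (illustrative; needs <stdio.h>):

        printf ("%s\n", basename_r ("/export/brick1/file.txt"));  /* "file.txt" */
        printf ("%s\n", basename_r (""));                         /* "." */
        printf ("%s\n", basename_r (NULL));                       /* "." */
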
diff --git a/contrib/libgen/dirname_r.c b/contrib/libgen/dirname_r.c
new file mode 100644
index 000000000..131cbcf2a
--- /dev/null
+++ b/contrib/libgen/dirname_r.c
@@ -0,0 +1,243 @@
+/*
+ * Borrowed from glibc-2.12.1/string/memrchr.c
+ * Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+ * Removed code for long bigger than 32 bytes, renamed __ptr_t as void *
+ * changed reg_char type to char.
+ */
+#include <string.h>
+#include <stdlib.h>
+#ifdef THREAD_UNSAFE_DIRNAME
+
+/* memrchr -- find the last occurrence of a byte in a memory block
+ Copyright (C) 1991, 93, 96, 97, 99, 2000 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+ with help from Dan Sahlin (dan@sics.se) and
+ commentary by Jim Blandy (jimb@ai.mit.edu);
+ adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+ and implemented by Roland McGrath (roland@ai.mit.edu).
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+void *
+__memrchr (s, c_in, n)
+ const void * s;
+ int c_in;
+ size_t n;
+{
+ const unsigned char *char_ptr;
+ const unsigned long int *longword_ptr;
+ unsigned long int longword, magic_bits, charmask;
+ unsigned char c;
+
+ c = (unsigned char) c_in;
+
+ /* Handle the last few characters by reading one character at a time.
+ Do this until CHAR_PTR is aligned on a longword boundary. */
+ for (char_ptr = (const unsigned char *) s + n;
+ n > 0 && ((unsigned long int) char_ptr
+ & (sizeof (longword) - 1)) != 0;
+ --n)
+ if (*--char_ptr == c)
+ return (void *) char_ptr;
+
+ /* All these elucidatory comments refer to 4-byte longwords,
+ but the theory applies equally well to 8-byte longwords. */
+
+ longword_ptr = (const unsigned long int *) char_ptr;
+
+ /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits
+ the "holes." Note that there is a hole just to the left of
+ each byte, with an extra at the end:
+
+ bits: 01111110 11111110 11111110 11111111
+ bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
+
+ The 1-bits make sure that carries propagate to the next 0-bit.
+ The 0-bits provide holes for carries to fall into. */
+
+ if (sizeof (longword) != 4 && sizeof (longword) != 8)
+ abort ();
+
+ magic_bits = 0x7efefeff;
+
+ /* Set up a longword, each of whose bytes is C. */
+ charmask = c | (c << 8);
+ charmask |= charmask << 16;
+
+ /* Instead of the traditional loop which tests each character,
+ we will test a longword at a time. The tricky part is testing
+ if *any of the four* bytes in the longword in question are zero. */
+ while (n >= sizeof (longword))
+ {
+ /* We tentatively exit the loop if adding MAGIC_BITS to
+ LONGWORD fails to change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-30 is set, there will be a carry
+ into bit 31, so all of the hole bits will be changed.
+
+ The one misfire occurs when bits 24-30 are clear and bit
+ 31 is set; in this case, the hole at bit 31 is not
+ changed. If we had access to the processor carry flag,
+ we could close this loophole by putting the fourth hole
+ at bit 32!
+
+ So it ignores everything except 128's, when they're aligned
+ properly.
+
+ 3) But wait! Aren't we looking for C, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is C. This turns each byte that is C
+ into a zero. */
+
+ longword = *--longword_ptr ^ charmask;
+
+ /* Add MAGIC_BITS to LONGWORD. */
+ if ((((longword + magic_bits)
+
+ /* Set those bits that were unchanged by the addition. */
+ ^ ~longword)
+
+ /* Look at only the hole bits. If any of the hole bits
+ are unchanged, most likely one of the bytes was a
+ zero. */
+ & ~magic_bits) != 0)
+ {
+ /* Which of the bytes was C? If none of them were, it was
+ a misfire; continue the search. */
+
+ const unsigned char *cp = (const unsigned char *) longword_ptr;
+
+ if (cp[3] == c)
+ return (void *) &cp[3];
+ if (cp[2] == c)
+ return (void *) &cp[2];
+ if (cp[1] == c)
+ return (void *) &cp[1];
+ if (cp[0] == c)
+ return (void *) cp;
+ }
+
+ n -= sizeof (longword);
+ }
+
+ char_ptr = (const unsigned char *) longword_ptr;
+
+ while (n-- > 0)
+ {
+ if (*--char_ptr == c)
+ return (void *) char_ptr;
+ }
+
+ return 0;
+}
+
+/*
+ * Borrowed from glibc-2.12.1/misc/dirname.c
+ */
+
+/* dirname - return directory part of PATH.
+ Copyright (C) 1996, 2000, 2001, 2002 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+char *
+dirname_r (char *path)
+{
+ static const char dot[] = ".";
+ char *last_slash;
+
+ /* Find last '/'. */
+ last_slash = path != NULL ? strrchr (path, '/') : NULL;
+
+ if (last_slash != NULL && last_slash != path && last_slash[1] == '\0')
+ {
+ /* Determine whether all remaining characters are slashes. */
+ char *runp;
+
+ for (runp = last_slash; runp != path; --runp)
+ if (runp[-1] != '/')
+ break;
+
+ /* The '/' is the last character, we have to look further. */
+ if (runp != path)
+ last_slash = __memrchr (path, '/', runp - path);
+ }
+
+ if (last_slash != NULL)
+ {
+ /* Determine whether all remaining characters are slashes. */
+ char *runp;
+
+ for (runp = last_slash; runp != path; --runp)
+ if (runp[-1] != '/')
+ break;
+
+ /* Terminate the path. */
+ if (runp == path)
+ {
+ /* The last slash is the first character in the string. We have to
+ return "/". As a special case we have to return "//" if there
+ are exactly two slashes at the beginning of the string. See
+ XBD 4.10 Path Name Resolution for more information. */
+ if (last_slash == path + 1)
+ ++last_slash;
+ else
+ last_slash = path + 1;
+ }
+ else
+ last_slash = runp;
+
+ last_slash[0] = '\0';
+ }
+ else
+    /* This assignment is ill-designed, but the XPG specs require returning a
+       string containing "." whenever no directory part is found, and so a
+       static, constant string is required.  */
+ path = (char *) dot;
+
+ return path;
+}
+#endif /* THREAD_UNSAFE_DIRNAME */
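
A small usage sketch of dirname_r() follows, assuming THREAD_UNSAFE_DIRNAME is defined and the program is linked with the file above; the main() and sample paths are hypothetical. Note that, like dirname(3), the helper may modify the buffer passed in and may return a pointer to a static string.

#include <stdio.h>

char *dirname_r (char *path);   /* defined above */

int main (void)
{
  char a[] = "/usr/lib/libglusterfs.so";
  char b[] = "filename-only";
  char c[] = "//";

  printf ("%s\n", dirname_r (a));   /* "/usr/lib" (a is modified in place) */
  printf ("%s\n", dirname_r (b));   /* "." (static string; do not modify) */
  printf ("%s\n", dirname_r (c));   /* "//", per XBD 4.10 */
  return 0;
}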
diff --git a/contrib/md5/md5.c b/contrib/md5/md5.c
deleted file mode 100644
index 5f0d0d157..000000000
--- a/contrib/md5/md5.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * RFC 1321 compliant MD5 implementation
- *
- * Copyright (C) 2001-2003 Christophe Devine
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, visit the http://fsf.org website.
- */
-
-
-#include <inttypes.h>
-#include <string.h>
-
-#include "md5.h"
-
-void md5_begin(md_context *ctx)
-{
- ctx->A = 0x67452301;
- ctx->B = 0xEFCDAB89;
- ctx->C = 0x98BADCFE;
- ctx->D = 0x10325476;
-
- ctx->totalN = ctx->totalN2 = 0;
-}
-
-static void md5_process(md_context *ctx, const uint8_t data[CSUM_CHUNK])
-{
- uint32_t X[16], A, B, C, D;
-
- A = ctx->A;
- B = ctx->B;
- C = ctx->C;
- D = ctx->D;
-
- X[0] = IVAL(data, 0);
- X[1] = IVAL(data, 4);
- X[2] = IVAL(data, 8);
- X[3] = IVAL(data, 12);
- X[4] = IVAL(data, 16);
- X[5] = IVAL(data, 20);
- X[6] = IVAL(data, 24);
- X[7] = IVAL(data, 28);
- X[8] = IVAL(data, 32);
- X[9] = IVAL(data, 36);
- X[10] = IVAL(data, 40);
- X[11] = IVAL(data, 44);
- X[12] = IVAL(data, 48);
- X[13] = IVAL(data, 52);
- X[14] = IVAL(data, 56);
- X[15] = IVAL(data, 60);
-
-#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n)))
-
-#define P(a,b,c,d,k,s,t) a += F(b,c,d) + X[k] + t, a = S(a,s) + b
-
-#define F(x,y,z) (z ^ (x & (y ^ z)))
-
- P(A, B, C, D, 0, 7, 0xD76AA478);
- P(D, A, B, C, 1, 12, 0xE8C7B756);
- P(C, D, A, B, 2, 17, 0x242070DB);
- P(B, C, D, A, 3, 22, 0xC1BDCEEE);
- P(A, B, C, D, 4, 7, 0xF57C0FAF);
- P(D, A, B, C, 5, 12, 0x4787C62A);
- P(C, D, A, B, 6, 17, 0xA8304613);
- P(B, C, D, A, 7, 22, 0xFD469501);
- P(A, B, C, D, 8, 7, 0x698098D8);
- P(D, A, B, C, 9, 12, 0x8B44F7AF);
- P(C, D, A, B, 10, 17, 0xFFFF5BB1);
- P(B, C, D, A, 11, 22, 0x895CD7BE);
- P(A, B, C, D, 12, 7, 0x6B901122);
- P(D, A, B, C, 13, 12, 0xFD987193);
- P(C, D, A, B, 14, 17, 0xA679438E);
- P(B, C, D, A, 15, 22, 0x49B40821);
-
-#undef F
-#define F(x,y,z) (y ^ (z & (x ^ y)))
-
- P(A, B, C, D, 1, 5, 0xF61E2562);
- P(D, A, B, C, 6, 9, 0xC040B340);
- P(C, D, A, B, 11, 14, 0x265E5A51);
- P(B, C, D, A, 0, 20, 0xE9B6C7AA);
- P(A, B, C, D, 5, 5, 0xD62F105D);
- P(D, A, B, C, 10, 9, 0x02441453);
- P(C, D, A, B, 15, 14, 0xD8A1E681);
- P(B, C, D, A, 4, 20, 0xE7D3FBC8);
- P(A, B, C, D, 9, 5, 0x21E1CDE6);
- P(D, A, B, C, 14, 9, 0xC33707D6);
- P(C, D, A, B, 3, 14, 0xF4D50D87);
- P(B, C, D, A, 8, 20, 0x455A14ED);
- P(A, B, C, D, 13, 5, 0xA9E3E905);
- P(D, A, B, C, 2, 9, 0xFCEFA3F8);
- P(C, D, A, B, 7, 14, 0x676F02D9);
- P(B, C, D, A, 12, 20, 0x8D2A4C8A);
-
-#undef F
-#define F(x,y,z) (x ^ y ^ z)
-
- P(A, B, C, D, 5, 4, 0xFFFA3942);
- P(D, A, B, C, 8, 11, 0x8771F681);
- P(C, D, A, B, 11, 16, 0x6D9D6122);
- P(B, C, D, A, 14, 23, 0xFDE5380C);
- P(A, B, C, D, 1, 4, 0xA4BEEA44);
- P(D, A, B, C, 4, 11, 0x4BDECFA9);
- P(C, D, A, B, 7, 16, 0xF6BB4B60);
- P(B, C, D, A, 10, 23, 0xBEBFBC70);
- P(A, B, C, D, 13, 4, 0x289B7EC6);
- P(D, A, B, C, 0, 11, 0xEAA127FA);
- P(C, D, A, B, 3, 16, 0xD4EF3085);
- P(B, C, D, A, 6, 23, 0x04881D05);
- P(A, B, C, D, 9, 4, 0xD9D4D039);
- P(D, A, B, C, 12, 11, 0xE6DB99E5);
- P(C, D, A, B, 15, 16, 0x1FA27CF8);
- P(B, C, D, A, 2, 23, 0xC4AC5665);
-
-#undef F
-#define F(x,y,z) (y ^ (x | ~z))
-
- P(A, B, C, D, 0, 6, 0xF4292244);
- P(D, A, B, C, 7, 10, 0x432AFF97);
- P(C, D, A, B, 14, 15, 0xAB9423A7);
- P(B, C, D, A, 5, 21, 0xFC93A039);
- P(A, B, C, D, 12, 6, 0x655B59C3);
- P(D, A, B, C, 3, 10, 0x8F0CCC92);
- P(C, D, A, B, 10, 15, 0xFFEFF47D);
- P(B, C, D, A, 1, 21, 0x85845DD1);
- P(A, B, C, D, 8, 6, 0x6FA87E4F);
- P(D, A, B, C, 15, 10, 0xFE2CE6E0);
- P(C, D, A, B, 6, 15, 0xA3014314);
- P(B, C, D, A, 13, 21, 0x4E0811A1);
- P(A, B, C, D, 4, 6, 0xF7537E82);
- P(D, A, B, C, 11, 10, 0xBD3AF235);
- P(C, D, A, B, 2, 15, 0x2AD7D2BB);
- P(B, C, D, A, 9, 21, 0xEB86D391);
-
-#undef F
-
- ctx->A += A;
- ctx->B += B;
- ctx->C += C;
- ctx->D += D;
-}
-
-void md5_update(md_context *ctx, const uint8_t *input, uint32_t length)
-{
- uint32_t left, fill;
-
- if (!length)
- return;
-
- left = ctx->totalN & 0x3F;
- fill = CSUM_CHUNK - left;
-
- ctx->totalN += length;
- ctx->totalN &= 0xFFFFFFFF;
-
- if (ctx->totalN < length)
- ctx->totalN2++;
-
- if (left && length >= fill) {
- memcpy(ctx->buffer + left, input, fill);
- md5_process(ctx, ctx->buffer);
- length -= fill;
- input += fill;
- left = 0;
- }
-
- while (length >= CSUM_CHUNK) {
- md5_process(ctx, input);
- length -= CSUM_CHUNK;
- input += CSUM_CHUNK;
- }
-
- if (length)
- memcpy(ctx->buffer + left, input, length);
-}
-
-static uint8_t md5_padding[CSUM_CHUNK] = { 0x80 };
-
-void md5_result(md_context *ctx, uint8_t digest[MD5_DIGEST_LEN])
-{
- uint32_t last, padn;
- uint32_t high, low;
- uint8_t msglen[8];
-
- high = (ctx->totalN >> 29)
- | (ctx->totalN2 << 3);
- low = (ctx->totalN << 3);
-
- SIVAL(msglen, 0, low);
- SIVAL(msglen, 4, high);
-
- last = ctx->totalN & 0x3F;
- padn = last < 56 ? 56 - last : 120 - last;
-
- md5_update(ctx, md5_padding, padn);
- md5_update(ctx, msglen, 8);
-
- SIVAL(digest, 0, ctx->A);
- SIVAL(digest, 4, ctx->B);
- SIVAL(digest, 8, ctx->C);
- SIVAL(digest, 12, ctx->D);
-}
-
-void get_md5(uint8_t *out, const uint8_t *input, int n)
-{
- md_context ctx;
- md5_begin(&ctx);
- md5_update(&ctx, input, n);
- md5_result(&ctx, out);
-}
-
-#ifdef TEST_MD5
-
-#include <stdlib.h>
-#include <stdio.h>
-
-/*
- * those are the standard RFC 1321 test vectors
- */
-
-static struct {
- char *str, *md5;
-} tests[] = {
- { "",
- "d41d8cd98f00b204e9800998ecf8427e" },
- { "a",
- "0cc175b9c0f1b6a831c399e269772661" },
- { "abc",
- "900150983cd24fb0d6963f7d28e17f72" },
- { "message digest",
- "f96b697d7cb7938d525a2f31aaf161d0" },
- { "abcdefghijklmnopqrstuvwxyz",
- "c3fcd3d76192e4007dfb496cca67e13b" },
- { "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
- "d174ab98d277d9f5a5611c2c9f419d9f" },
- { "12345678901234567890123456789012345678901234567890123456789012345678901234567890",
- "57edf4a22be3c955ac49da2e2107b67a" },
- { NULL, NULL }
-};
-
-int main(int argc, char *argv[])
-{
- FILE *f;
- int i, j;
- char output[33];
- md_context ctx;
- uint8_t buf[1000];
- uint8_t md5sum[MD5_DIGEST_LEN];
-
- if (argc < 2) {
- printf("\nMD5 Validation Tests:\n\n");
-
- for (i = 0; tests[i].str; i++) {
- char *str = tests[i].str;
- char *chk = tests[i].md5;
-
- printf(" Test %d ", i + 1);
-
- get_md5(md5sum, str, strlen(str));
-
- for (j = 0; j < MD5_DIGEST_LEN; j++)
- sprintf(output + j * 2, "%02x", md5sum[j]);
-
- if (memcmp(output, chk, 32)) {
- printf("failed!\n");
- return 1;
- }
-
- printf("passed.\n");
- }
-
- printf("\n");
- return 0;
- }
-
- while (--argc) {
- if (!(f = fopen(*++argv, "rb"))) {
- perror("fopen");
- return 1;
- }
-
- md5_begin(&ctx);
-
- while ((i = fread(buf, 1, sizeof buf, f)) > 0)
- md5_update(&ctx, buf, i);
-
- fclose(f);
-
- md5_result(&ctx, md5sum);
-
- for (j = 0; j < MD5_DIGEST_LEN; j++)
- printf("%02x", md5sum[j]);
-
- printf(" %s\n", *argv);
- }
-
- return 0;
-}
-
-#endif
diff --git a/contrib/md5/md5.h b/contrib/md5/md5.h
deleted file mode 100644
index ba8f08dbc..000000000
--- a/contrib/md5/md5.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* rsync-3.0.6/byteorder.h */
-
-/*
- * Simple byteorder handling.
- *
- * Copyright (C) 1992-1995 Andrew Tridgell
- * Copyright (C) 2007-2008 Wayne Davison
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, visit the http://fsf.org website.
- */
-
-#undef CAREFUL_ALIGNMENT
-
-/* We know that the x86 can handle misalignment and has the same
- * byte order (LSB-first) as the 32-bit numbers we transmit. */
-
-#ifdef __i386__
-#define CAREFUL_ALIGNMENT 0
-#endif
-
-#ifndef CAREFUL_ALIGNMENT
-#define CAREFUL_ALIGNMENT 1
-#endif
-
-#define CVAL(buf,pos) (((unsigned char *)(buf))[pos])
-#define UVAL(buf,pos) ((uint32_t)CVAL(buf,pos))
-#define SCVAL(buf,pos,val) (CVAL(buf,pos) = (val))
-
-#if CAREFUL_ALIGNMENT
-#define PVAL(buf,pos) (UVAL(buf,pos)|UVAL(buf,(pos)+1)<<8)
-#define IVAL(buf,pos) (PVAL(buf,pos)|PVAL(buf,(pos)+2)<<16)
-#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
-#define SIVALX(buf,pos,val) (SSVALX(buf,pos,val&0xFFFF),SSVALX(buf,pos+2,val>>16))
-#define SIVAL(buf,pos,val) SIVALX((buf),(pos),((uint32_t)(val)))
-#else
-
-/* this handles things for architectures like the 386 that can handle
- alignment errors */
-
-/*
- WARNING: This section is dependent on the length of int32
- being correct. set CAREFUL_ALIGNMENT if it is not.
-*/
-
-#define IVAL(buf,pos) (*(uint32_t *)((char *)(buf) + (pos)))
-#define SIVAL(buf,pos,val) IVAL(buf,pos)=((uint32_t)(val))
-#endif
-
-/* The include file for both the MD4 and MD5 routines. */
-
-#define MD5_DIGEST_LEN 16
-#define MAX_DIGEST_LEN MD5_DIGEST_LEN
-
-#define CSUM_CHUNK 64
-
-typedef struct {
- uint32_t A, B, C, D;
- uint32_t totalN; /* bit count, lower 32 bits */
- uint32_t totalN2; /* bit count, upper 32 bits */
- uint8_t buffer[CSUM_CHUNK];
-} md_context;
-
-void md5_begin(md_context *ctx);
-void md5_update(md_context *ctx, const uint8_t *input, uint32_t length);
-void md5_result(md_context *ctx, uint8_t digest[MD5_DIGEST_LEN]);
-
-void get_md5(uint8_t digest[MD5_DIGEST_LEN], const uint8_t *input, int n);
diff --git a/contrib/qemu/block.c b/contrib/qemu/block.c
new file mode 100644
index 000000000..b56024113
--- /dev/null
+++ b/contrib/qemu/block.c
@@ -0,0 +1,4604 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "config-host.h"
+#include "qemu-common.h"
+#include "trace.h"
+#include "monitor/monitor.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/module.h"
+#include "qapi/qmp/qjson.h"
+#include "sysemu/sysemu.h"
+#include "qemu/notify.h"
+#include "block/coroutine.h"
+#include "qmp-commands.h"
+#include "qemu/timer.h"
+
+#ifdef CONFIG_BSD
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#ifndef __DragonFly__
+#include <sys/disk.h>
+#endif
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+
+typedef enum {
+ BDRV_REQ_COPY_ON_READ = 0x1,
+ BDRV_REQ_ZERO_WRITE = 0x2,
+} BdrvRequestFlags;
+
+static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
+static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
+static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors);
+
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait);
+
+static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
+ QTAILQ_HEAD_INITIALIZER(bdrv_states);
+
+static QLIST_HEAD(, BlockDriver) bdrv_drivers =
+ QLIST_HEAD_INITIALIZER(bdrv_drivers);
+
+/* If non-zero, use only whitelisted block drivers */
+static int use_bdrv_whitelist;
+
+#ifdef _WIN32
+static int is_windows_drive_prefix(const char *filename)
+{
+ return (((filename[0] >= 'a' && filename[0] <= 'z') ||
+ (filename[0] >= 'A' && filename[0] <= 'Z')) &&
+ filename[1] == ':');
+}
+
+int is_windows_drive(const char *filename)
+{
+ if (is_windows_drive_prefix(filename) &&
+ filename[2] == '\0')
+ return 1;
+ if (strstart(filename, "\\\\.\\", NULL) ||
+ strstart(filename, "//./", NULL))
+ return 1;
+ return 0;
+}
+#endif
+
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+
+ while (qemu_co_queue_next(&bs->throttled_reqs));
+
+ if (bs->block_timer) {
+ qemu_del_timer(bs->block_timer);
+ qemu_free_timer(bs->block_timer);
+ bs->block_timer = NULL;
+ }
+
+ bs->slice_start = 0;
+ bs->slice_end = 0;
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+
+ qemu_co_queue_next(&bs->throttled_reqs);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+ qemu_co_queue_init(&bs->throttled_reqs);
+ bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+ bs->io_limits_enabled = true;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+ BlockIOLimit *io_limits = &bs->io_limits;
+ return io_limits->bps[BLOCK_IO_LIMIT_READ]
+ || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
+ || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
+ || io_limits->iops[BLOCK_IO_LIMIT_READ]
+ || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
+ || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
+}
+
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+ bool is_write, int nb_sectors)
+{
+ int64_t wait_time = -1;
+
+ if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+ qemu_co_queue_wait(&bs->throttled_reqs);
+ }
+
+    /* We aim to preserve each request's timing, in FIFO order. The next
+     * throttled requests will not be dequeued until the current request is
+     * allowed to be serviced. So if the current request still exceeds the
+     * limits, it is re-inserted at the head of the queue, and all requests
+     * that follow it remain in the throttled_reqs queue.
+     */
+
+ while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
+ qemu_mod_timer(bs->block_timer,
+ wait_time + qemu_get_clock_ns(vm_clock));
+ qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+ }
+
+ qemu_co_queue_next(&bs->throttled_reqs);
+}
+
+/* check if the path starts with "<protocol>:" */
+static int path_has_protocol(const char *path)
+{
+ const char *p;
+
+#ifdef _WIN32
+ if (is_windows_drive(path) ||
+ is_windows_drive_prefix(path)) {
+ return 0;
+ }
+ p = path + strcspn(path, ":/\\");
+#else
+ p = path + strcspn(path, ":/");
+#endif
+
+ return *p == ':';
+}
+
+int path_is_absolute(const char *path)
+{
+#ifdef _WIN32
+ /* specific case for names like: "\\.\d:" */
+ if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
+ return 1;
+ }
+ return (*path == '/' || *path == '\\');
+#else
+ return (*path == '/');
+#endif
+}
+
+/* If filename is absolute, just copy it to dest. Otherwise, build a
+   path to it by treating it as relative to base_path. URLs are
+   supported. */
+void path_combine(char *dest, int dest_size,
+ const char *base_path,
+ const char *filename)
+{
+ const char *p, *p1;
+ int len;
+
+ if (dest_size <= 0)
+ return;
+ if (path_is_absolute(filename)) {
+ pstrcpy(dest, dest_size, filename);
+ } else {
+ p = strchr(base_path, ':');
+ if (p)
+ p++;
+ else
+ p = base_path;
+ p1 = strrchr(base_path, '/');
+#ifdef _WIN32
+ {
+ const char *p2;
+ p2 = strrchr(base_path, '\\');
+ if (!p1 || p2 > p1)
+ p1 = p2;
+ }
+#endif
+ if (p1)
+ p1++;
+ else
+ p1 = base_path;
+ if (p1 > p)
+ p = p1;
+ len = p - base_path;
+ if (len > dest_size - 1)
+ len = dest_size - 1;
+ memcpy(dest, base_path, len);
+ dest[len] = '\0';
+ pstrcat(dest, dest_size, filename);
+ }
+}
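
A hypothetical caller illustrating the comment above; it assumes linking against this file together with QEMU's string helpers (pstrcpy/pstrcat).

#include <stdio.h>

void path_combine(char *dest, int dest_size,
                  const char *base_path, const char *filename); /* defined above */

int main(void)
{
    char dest[256];

    /* Relative backing file: resolved next to the base image. */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.qcow2");
    printf("%s\n", dest);   /* "/images/backing.qcow2" */

    /* Absolute backing file: copied through unchanged. */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "/other/backing.qcow2");
    printf("%s\n", dest);   /* "/other/backing.qcow2" */

    return 0;
}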
+
+void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
+{
+ if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
+ pstrcpy(dest, sz, bs->backing_file);
+ } else {
+ path_combine(dest, sz, bs->filename, bs->backing_file);
+ }
+}
+
+void bdrv_register(BlockDriver *bdrv)
+{
+ /* Block drivers without coroutine functions need emulation */
+ if (!bdrv->bdrv_co_readv) {
+ bdrv->bdrv_co_readv = bdrv_co_readv_em;
+ bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+ /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
+ * the block driver lacks aio we need to emulate that too.
+ */
+ if (!bdrv->bdrv_aio_readv) {
+ /* add AIO emulation layer */
+ bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+ bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+ }
+ }
+
+ QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
+}
+
+/* create a new block device (by default it is empty) */
+BlockDriverState *bdrv_new(const char *device_name)
+{
+ BlockDriverState *bs;
+
+ bs = g_malloc0(sizeof(BlockDriverState));
+ pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
+ if (device_name[0] != '\0') {
+ QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
+ }
+ bdrv_iostatus_disable(bs);
+ notifier_list_init(&bs->close_notifiers);
+ notifier_with_return_list_init(&bs->before_write_notifiers);
+
+ return bs;
+}
+
+void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
+{
+ notifier_list_add(&bs->close_notifiers, notify);
+}
+
+BlockDriver *bdrv_find_format(const char *format_name)
+{
+ BlockDriver *drv1;
+ QLIST_FOREACH(drv1, &bdrv_drivers, list) {
+ if (!strcmp(drv1->format_name, format_name)) {
+ return drv1;
+ }
+ }
+ return NULL;
+}
+
+static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
+{
+ static const char *whitelist_rw[] = {
+ CONFIG_BDRV_RW_WHITELIST
+ };
+ static const char *whitelist_ro[] = {
+ CONFIG_BDRV_RO_WHITELIST
+ };
+ const char **p;
+
+ if (!whitelist_rw[0] && !whitelist_ro[0]) {
+ return 1; /* no whitelist, anything goes */
+ }
+
+ for (p = whitelist_rw; *p; p++) {
+ if (!strcmp(drv->format_name, *p)) {
+ return 1;
+ }
+ }
+ if (read_only) {
+ for (p = whitelist_ro; *p; p++) {
+ if (!strcmp(drv->format_name, *p)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
+ bool read_only)
+{
+ BlockDriver *drv = bdrv_find_format(format_name);
+ return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
+}
+
+typedef struct CreateCo {
+ BlockDriver *drv;
+ char *filename;
+ QEMUOptionParameter *options;
+ int ret;
+} CreateCo;
+
+static void coroutine_fn bdrv_create_co_entry(void *opaque)
+{
+ CreateCo *cco = opaque;
+ assert(cco->drv);
+
+ cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
+}
+
+int bdrv_create(BlockDriver *drv, const char* filename,
+ QEMUOptionParameter *options)
+{
+ int ret;
+
+ Coroutine *co;
+ CreateCo cco = {
+ .drv = drv,
+ .filename = g_strdup(filename),
+ .options = options,
+ .ret = NOT_DONE,
+ };
+
+ if (!drv->bdrv_create) {
+ ret = -ENOTSUP;
+ goto out;
+ }
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_create_co_entry(&cco);
+ } else {
+ co = qemu_coroutine_create(bdrv_create_co_entry);
+ qemu_coroutine_enter(co, &cco);
+ while (cco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+
+ ret = cco.ret;
+
+out:
+ g_free(cco.filename);
+ return ret;
+}
+
+int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
+{
+ BlockDriver *drv;
+
+ drv = bdrv_find_protocol(filename, true);
+ if (drv == NULL) {
+ return -ENOENT;
+ }
+
+ return bdrv_create(drv, filename, options);
+}
+
+/*
+ * Create a uniquely-named empty temporary file.
+ * Return 0 upon success, otherwise a negative errno value.
+ */
+int get_tmp_filename(char *filename, int size)
+{
+#ifdef _WIN32
+ char temp_dir[MAX_PATH];
+ /* GetTempFileName requires that its output buffer (4th param)
+ have length MAX_PATH or greater. */
+ assert(size >= MAX_PATH);
+ return (GetTempPath(MAX_PATH, temp_dir)
+ && GetTempFileName(temp_dir, "qem", 0, filename)
+ ? 0 : -GetLastError());
+#else
+ int fd;
+ const char *tmpdir;
+ tmpdir = getenv("TMPDIR");
+ if (!tmpdir)
+ tmpdir = "/tmp";
+ if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
+ return -EOVERFLOW;
+ }
+ fd = mkstemp(filename);
+ if (fd < 0) {
+ return -errno;
+ }
+ if (close(fd) != 0) {
+ unlink(filename);
+ return -errno;
+ }
+ return 0;
+#endif
+}
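
A hypothetical usage sketch, assuming the program is linked with this file and built on a POSIX host.

#include <stdio.h>
#include <limits.h>   /* PATH_MAX */

int get_tmp_filename(char *filename, int size);   /* defined above */

int main(void)
{
    char tmp[PATH_MAX];

    if (get_tmp_filename(tmp, sizeof(tmp)) < 0) {
        fprintf(stderr, "could not create a temporary file\n");
        return 1;
    }
    printf("%s\n", tmp);   /* e.g. an expanded "/tmp/vl.XXXXXX" name */
    return 0;
}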
+
+/*
+ * Detect host devices. By convention, /dev/cdrom[N] is always
+ * recognized as a host CDROM.
+ */
+static BlockDriver *find_hdev_driver(const char *filename)
+{
+ int score_max = 0, score;
+ BlockDriver *drv = NULL, *d;
+
+ QLIST_FOREACH(d, &bdrv_drivers, list) {
+ if (d->bdrv_probe_device) {
+ score = d->bdrv_probe_device(filename);
+ if (score > score_max) {
+ score_max = score;
+ drv = d;
+ }
+ }
+ }
+
+ return drv;
+}
+
+BlockDriver *bdrv_find_protocol(const char *filename,
+ bool allow_protocol_prefix)
+{
+ BlockDriver *drv1;
+ char protocol[128];
+ int len;
+ const char *p;
+
+ /* TODO Drivers without bdrv_file_open must be specified explicitly */
+
+ /*
+ * XXX(hch): we really should not let host device detection
+ * override an explicit protocol specification, but moving this
+ * later breaks access to device names with colons in them.
+ * Thanks to the brain-dead persistent naming schemes on udev-
+ * based Linux systems those actually are quite common.
+ */
+ drv1 = find_hdev_driver(filename);
+ if (drv1) {
+ return drv1;
+ }
+
+ if (!path_has_protocol(filename) || !allow_protocol_prefix) {
+ return bdrv_find_format("file");
+ }
+
+ p = strchr(filename, ':');
+ assert(p != NULL);
+ len = p - filename;
+ if (len > sizeof(protocol) - 1)
+ len = sizeof(protocol) - 1;
+ memcpy(protocol, filename, len);
+ protocol[len] = '\0';
+ QLIST_FOREACH(drv1, &bdrv_drivers, list) {
+ if (drv1->protocol_name &&
+ !strcmp(drv1->protocol_name, protocol)) {
+ return drv1;
+ }
+ }
+ return NULL;
+}
+
+static int find_image_format(BlockDriverState *bs, const char *filename,
+ BlockDriver **pdrv)
+{
+ int score, score_max;
+ BlockDriver *drv1, *drv;
+ uint8_t buf[2048];
+ int ret = 0;
+
+ /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
+ if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
+ drv = bdrv_find_format("raw");
+ if (!drv) {
+ ret = -ENOENT;
+ }
+ *pdrv = drv;
+ return ret;
+ }
+
+ ret = bdrv_pread(bs, 0, buf, sizeof(buf));
+ if (ret < 0) {
+ *pdrv = NULL;
+ return ret;
+ }
+
+ score_max = 0;
+ drv = NULL;
+ QLIST_FOREACH(drv1, &bdrv_drivers, list) {
+ if (drv1->bdrv_probe) {
+ score = drv1->bdrv_probe(buf, ret, filename);
+ if (score > score_max) {
+ score_max = score;
+ drv = drv1;
+ }
+ }
+ }
+ if (!drv) {
+ ret = -ENOENT;
+ }
+ *pdrv = drv;
+ return ret;
+}
+
+/**
+ * Set the current 'total_sectors' value
+ */
+static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
+{
+ BlockDriver *drv = bs->drv;
+
+ /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
+ if (bs->sg)
+ return 0;
+
+ /* query actual device if possible, otherwise just trust the hint */
+ if (drv->bdrv_getlength) {
+ int64_t length = drv->bdrv_getlength(bs);
+ if (length < 0) {
+ return length;
+ }
+ hint = length >> BDRV_SECTOR_BITS;
+ }
+
+ bs->total_sectors = hint;
+ return 0;
+}
+
+/**
+ * Set open flags for a given discard mode
+ *
+ * Return 0 on success, -1 if the discard mode was invalid.
+ */
+int bdrv_parse_discard_flags(const char *mode, int *flags)
+{
+ *flags &= ~BDRV_O_UNMAP;
+
+ if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
+ /* do nothing */
+ } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
+ *flags |= BDRV_O_UNMAP;
+ } else {
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * Set open flags for a given cache mode
+ *
+ * Return 0 on success, -1 if the cache mode was invalid.
+ */
+int bdrv_parse_cache_flags(const char *mode, int *flags)
+{
+ *flags &= ~BDRV_O_CACHE_MASK;
+
+ if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
+ *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
+ } else if (!strcmp(mode, "directsync")) {
+ *flags |= BDRV_O_NOCACHE;
+ } else if (!strcmp(mode, "writeback")) {
+ *flags |= BDRV_O_CACHE_WB;
+ } else if (!strcmp(mode, "unsafe")) {
+ *flags |= BDRV_O_CACHE_WB;
+ *flags |= BDRV_O_NO_FLUSH;
+ } else if (!strcmp(mode, "writethrough")) {
+ /* this is the default */
+ } else {
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+ bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+ assert(bs->copy_on_read > 0);
+ bs->copy_on_read--;
+}
+
+static int bdrv_open_flags(BlockDriverState *bs, int flags)
+{
+ int open_flags = flags | BDRV_O_CACHE_WB;
+
+ /*
+ * Clear flags that are internal to the block layer before opening the
+ * image.
+ */
+ open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
+
+ /*
+ * Snapshots should be writable.
+ */
+ if (bs->is_temporary) {
+ open_flags |= BDRV_O_RDWR;
+ }
+
+ return open_flags;
+}
+
+/*
+ * Common part for opening disk images and files
+ *
+ * Removes all processed options from *options.
+ */
+static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
+ QDict *options, int flags, BlockDriver *drv)
+{
+ int ret, open_flags;
+ const char *filename;
+
+ assert(drv != NULL);
+ assert(bs->file == NULL);
+ assert(options != NULL && bs->options != options);
+
+ if (file != NULL) {
+ filename = file->filename;
+ } else {
+ filename = qdict_get_try_str(options, "filename");
+ }
+
+ trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
+
+    /* bdrv_open() was called directly with a protocol as drv. This layer is
+     * already opened, so assign it to bs (while file becomes a closed
+     * BlockDriverState) and return immediately. */
+ if (file != NULL && drv->bdrv_file_open) {
+ bdrv_swap(file, bs);
+ return 0;
+ }
+
+ bs->open_flags = flags;
+ bs->buffer_alignment = 512;
+ open_flags = bdrv_open_flags(bs, flags);
+ bs->read_only = !(open_flags & BDRV_O_RDWR);
+
+ if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
+ return -ENOTSUP;
+ }
+
+ assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
+ if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
+ bdrv_enable_copy_on_read(bs);
+ }
+
+ if (filename != NULL) {
+ pstrcpy(bs->filename, sizeof(bs->filename), filename);
+ } else {
+ bs->filename[0] = '\0';
+ }
+
+ bs->drv = drv;
+ bs->opaque = g_malloc0(drv->instance_size);
+
+ bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
+
+ /* Open the image, either directly or using a protocol */
+ if (drv->bdrv_file_open) {
+ assert(file == NULL);
+ assert(drv->bdrv_parse_filename || filename != NULL);
+ ret = drv->bdrv_file_open(bs, options, open_flags);
+ } else {
+ if (file == NULL) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a "
+ "block driver for the protocol level",
+ drv->format_name);
+ ret = -EINVAL;
+ goto free_and_fail;
+ }
+ assert(file != NULL);
+ bs->file = file;
+ ret = drv->bdrv_open(bs, options, open_flags);
+ }
+
+ if (ret < 0) {
+ goto free_and_fail;
+ }
+
+ ret = refresh_total_sectors(bs, bs->total_sectors);
+ if (ret < 0) {
+ goto free_and_fail;
+ }
+
+#ifndef _WIN32
+ if (bs->is_temporary) {
+ assert(filename != NULL);
+ unlink(filename);
+ }
+#endif
+ return 0;
+
+free_and_fail:
+ bs->file = NULL;
+ g_free(bs->opaque);
+ bs->opaque = NULL;
+ bs->drv = NULL;
+ return ret;
+}
+
+/*
+ * Opens a file using a protocol (file, host_device, nbd, ...)
+ *
+ * options is a QDict of options to pass to the block drivers, or NULL for an
+ * empty set of options. The reference to the QDict belongs to the block layer
+ * after the call (even on failure), so if the caller intends to reuse the
+ * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
+ */
+int bdrv_file_open(BlockDriverState **pbs, const char *filename,
+ QDict *options, int flags)
+{
+ BlockDriverState *bs;
+ BlockDriver *drv;
+ const char *drvname;
+ bool allow_protocol_prefix = false;
+ int ret;
+
+ /* NULL means an empty set of options */
+ if (options == NULL) {
+ options = qdict_new();
+ }
+
+ bs = bdrv_new("");
+ bs->options = options;
+ options = qdict_clone_shallow(options);
+
+ /* Fetch the file name from the options QDict if necessary */
+ if (!filename) {
+ filename = qdict_get_try_str(options, "filename");
+ } else if (filename && !qdict_haskey(options, "filename")) {
+ qdict_put(options, "filename", qstring_from_str(filename));
+ allow_protocol_prefix = true;
+ } else {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and "
+ "'filename' options at the same time");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Find the right block driver */
+ drvname = qdict_get_try_str(options, "driver");
+ if (drvname) {
+ drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
+ qdict_del(options, "driver");
+ } else if (filename) {
+ drv = bdrv_find_protocol(filename, allow_protocol_prefix);
+ if (!drv) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Unknown protocol");
+ }
+ } else {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR,
+ "Must specify either driver or file");
+ drv = NULL;
+ }
+
+ if (!drv) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ /* Parse the filename and open it */
+ if (drv->bdrv_parse_filename && filename) {
+ Error *local_err = NULL;
+ drv->bdrv_parse_filename(filename, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+ qdict_del(options, "filename");
+ } else if (!drv->bdrv_parse_filename && !filename) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR,
+ "The '%s' block driver requires a file name",
+ drv->format_name);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = bdrv_open_common(bs, NULL, options, flags, drv);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Check if any unknown options were used */
+ if (qdict_size(options) != 0) {
+ const QDictEntry *entry = qdict_first(options);
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
+ "support the option '%s'",
+ drv->format_name, entry->key);
+ ret = -EINVAL;
+ goto fail;
+ }
+ QDECREF(options);
+
+ bs->growable = 1;
+ *pbs = bs;
+ return 0;
+
+fail:
+ QDECREF(options);
+ if (!bs->drv) {
+ QDECREF(bs->options);
+ }
+ bdrv_delete(bs);
+ return ret;
+}
+
+/*
+ * Opens the backing file for a BlockDriverState if not yet open
+ *
+ * options is a QDict of options to pass to the block drivers, or NULL for an
+ * empty set of options. The reference to the QDict is transferred to this
+ * function (even on failure), so if the caller intends to reuse the dictionary,
+ * it needs to use QINCREF() before calling bdrv_file_open.
+ */
+int bdrv_open_backing_file(BlockDriverState *bs, QDict *options)
+{
+ char backing_filename[PATH_MAX];
+ int back_flags, ret;
+ BlockDriver *back_drv = NULL;
+
+ if (bs->backing_hd != NULL) {
+ QDECREF(options);
+ return 0;
+ }
+
+ /* NULL means an empty set of options */
+ if (options == NULL) {
+ options = qdict_new();
+ }
+
+ bs->open_flags &= ~BDRV_O_NO_BACKING;
+ if (qdict_haskey(options, "file.filename")) {
+ backing_filename[0] = '\0';
+ } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
+ QDECREF(options);
+ return 0;
+ }
+
+ bs->backing_hd = bdrv_new("");
+ bdrv_get_full_backing_filename(bs, backing_filename,
+ sizeof(backing_filename));
+
+ if (bs->backing_format[0] != '\0') {
+ back_drv = bdrv_find_format(bs->backing_format);
+ }
+
+ /* backing files always opened read-only */
+ back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);
+
+ ret = bdrv_open(bs->backing_hd,
+ *backing_filename ? backing_filename : NULL, options,
+ back_flags, back_drv);
+ if (ret < 0) {
+ bdrv_delete(bs->backing_hd);
+ bs->backing_hd = NULL;
+ bs->open_flags |= BDRV_O_NO_BACKING;
+ return ret;
+ }
+ return 0;
+}
+
+static void extract_subqdict(QDict *src, QDict **dst, const char *start)
+{
+ const QDictEntry *entry, *next;
+ const char *p;
+
+ *dst = qdict_new();
+ entry = qdict_first(src);
+
+ while (entry != NULL) {
+ next = qdict_next(src, entry);
+ if (strstart(entry->key, start, &p)) {
+ qobject_incref(entry->value);
+ qdict_put_obj(*dst, p, entry->value);
+ qdict_del(src, entry->key);
+ }
+ entry = next;
+ }
+}
+
+/*
+ * Opens a disk image (raw, qcow2, vmdk, ...)
+ *
+ * options is a QDict of options to pass to the block drivers, or NULL for an
+ * empty set of options. The reference to the QDict belongs to the block layer
+ * after the call (even on failure), so if the caller intends to reuse the
+ * dictionary, it needs to use QINCREF() before calling bdrv_open.
+ */
+int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
+ int flags, BlockDriver *drv)
+{
+ int ret;
+ /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
+ char tmp_filename[PATH_MAX + 1];
+ BlockDriverState *file = NULL;
+ QDict *file_options = NULL;
+
+ /* NULL means an empty set of options */
+ if (options == NULL) {
+ options = qdict_new();
+ }
+
+ bs->options = options;
+ options = qdict_clone_shallow(options);
+
+ /* For snapshot=on, create a temporary qcow2 overlay */
+ if (flags & BDRV_O_SNAPSHOT) {
+ BlockDriverState *bs1;
+ int64_t total_size;
+ BlockDriver *bdrv_qcow2;
+ QEMUOptionParameter *create_options;
+ char backing_filename[PATH_MAX];
+
+ if (qdict_size(options) != 0) {
+ error_report("Can't use snapshot=on with driver-specific options");
+ ret = -EINVAL;
+ goto fail;
+ }
+ assert(filename != NULL);
+
+ /* if snapshot, we create a temporary backing file and open it
+ instead of opening 'filename' directly */
+
+ /* if there is a backing file, use it */
+ bs1 = bdrv_new("");
+ ret = bdrv_open(bs1, filename, NULL, 0, drv);
+ if (ret < 0) {
+ bdrv_delete(bs1);
+ goto fail;
+ }
+ total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
+
+ bdrv_delete(bs1);
+
+ ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Real path is meaningless for protocols */
+ if (path_has_protocol(filename)) {
+ snprintf(backing_filename, sizeof(backing_filename),
+ "%s", filename);
+ } else if (!realpath(filename, backing_filename)) {
+ ret = -errno;
+ goto fail;
+ }
+
+ bdrv_qcow2 = bdrv_find_format("qcow2");
+ create_options = parse_option_parameters("", bdrv_qcow2->create_options,
+ NULL);
+
+ set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
+ set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
+ backing_filename);
+ if (drv) {
+ set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
+ drv->format_name);
+ }
+
+ ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
+ free_option_parameters(create_options);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ filename = tmp_filename;
+ drv = bdrv_qcow2;
+ bs->is_temporary = 1;
+ }
+
+ /* Open image file without format layer */
+ if (flags & BDRV_O_RDWR) {
+ flags |= BDRV_O_ALLOW_RDWR;
+ }
+
+ extract_subqdict(options, &file_options, "file.");
+
+ ret = bdrv_file_open(&file, filename, file_options,
+ bdrv_open_flags(bs, flags | BDRV_O_UNMAP));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Find the right image format driver */
+ if (!drv) {
+ ret = find_image_format(file, filename, &drv);
+ }
+
+ if (!drv) {
+ goto unlink_and_fail;
+ }
+
+ /* Open the image */
+ ret = bdrv_open_common(bs, file, options, flags, drv);
+ if (ret < 0) {
+ goto unlink_and_fail;
+ }
+
+ if (bs->file != file) {
+ bdrv_delete(file);
+ file = NULL;
+ }
+
+ /* If there is a backing file, use it */
+ if ((flags & BDRV_O_NO_BACKING) == 0) {
+ QDict *backing_options;
+
+ extract_subqdict(options, &backing_options, "backing.");
+ ret = bdrv_open_backing_file(bs, backing_options);
+ if (ret < 0) {
+ goto close_and_fail;
+ }
+ }
+
+ /* Check if any unknown options were used */
+ if (qdict_size(options) != 0) {
+ const QDictEntry *entry = qdict_first(options);
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
+ "device '%s' doesn't support the option '%s'",
+ drv->format_name, bs->device_name, entry->key);
+
+ ret = -EINVAL;
+ goto close_and_fail;
+ }
+ QDECREF(options);
+
+ if (!bdrv_key_required(bs)) {
+ bdrv_dev_change_media_cb(bs, true);
+ }
+
+ /* throttling disk I/O limits */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_enable(bs);
+ }
+
+ return 0;
+
+unlink_and_fail:
+ if (file != NULL) {
+ bdrv_delete(file);
+ }
+ if (bs->is_temporary) {
+ unlink(filename);
+ }
+fail:
+ QDECREF(bs->options);
+ QDECREF(options);
+ bs->options = NULL;
+ return ret;
+
+close_and_fail:
+ bdrv_close(bs);
+ QDECREF(options);
+ return ret;
+}
+
+typedef struct BlockReopenQueueEntry {
+ bool prepared;
+ BDRVReopenState state;
+ QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
+} BlockReopenQueueEntry;
+
+/*
+ * Adds a BlockDriverState to a simple queue for an atomic, transactional
+ * reopen of multiple devices.
+ *
+ * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLEQ_INIT
+ * already performed, or alternatively may be NULL, in which case a new
+ * BlockReopenQueue will be created and initialized. This newly created
+ * BlockReopenQueue should be passed back in for subsequent calls that are
+ * intended to be part of the same atomic 'set'.
+ *
+ * bs is the BlockDriverState to add to the reopen queue.
+ *
+ * flags contains the open flags for the associated bs
+ *
+ * returns a pointer to bs_queue, which is either the newly allocated
+ * bs_queue, or the existing bs_queue being used.
+ *
+ */
+BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
+ BlockDriverState *bs, int flags)
+{
+ assert(bs != NULL);
+
+ BlockReopenQueueEntry *bs_entry;
+ if (bs_queue == NULL) {
+ bs_queue = g_new0(BlockReopenQueue, 1);
+ QSIMPLEQ_INIT(bs_queue);
+ }
+
+ if (bs->file) {
+ bdrv_reopen_queue(bs_queue, bs->file, flags);
+ }
+
+ bs_entry = g_new0(BlockReopenQueueEntry, 1);
+ QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
+
+ bs_entry->state.bs = bs;
+ bs_entry->state.flags = flags;
+
+ return bs_queue;
+}
+
+/*
+ * Reopen multiple BlockDriverStates atomically & transactionally.
+ *
+ * The queue passed in (bs_queue) must have been built up previously
+ * via bdrv_reopen_queue().
+ *
+ * Reopens all BDS specified in the queue, with the appropriate
+ * flags. All devices are prepared for reopen, and failure of any
+ * device will cause all device changes to be abandoned, and intermediate
+ * data cleaned up.
+ *
+ * If all devices prepare successfully, then the changes are committed
+ * to all devices.
+ *
+ */
+int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
+{
+ int ret = -1;
+ BlockReopenQueueEntry *bs_entry, *next;
+ Error *local_err = NULL;
+
+ assert(bs_queue != NULL);
+
+ bdrv_drain_all();
+
+ QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+ if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
+ error_propagate(errp, local_err);
+ goto cleanup;
+ }
+ bs_entry->prepared = true;
+ }
+
+ /* If we reach this point, we have success and just need to apply the
+ * changes
+ */
+ QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+ bdrv_reopen_commit(&bs_entry->state);
+ }
+
+ ret = 0;
+
+cleanup:
+ QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
+ if (ret && bs_entry->prepared) {
+ bdrv_reopen_abort(&bs_entry->state);
+ }
+ g_free(bs_entry);
+ }
+ g_free(bs_queue);
+ return ret;
+}
+
+
+/* Reopen a single BlockDriverState with the specified flags. */
+int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
+{
+ int ret = -1;
+ Error *local_err = NULL;
+ BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
+
+ ret = bdrv_reopen_multiple(queue, &local_err);
+ if (local_err != NULL) {
+ error_propagate(errp, local_err);
+ }
+ return ret;
+}
+
+
+/*
+ * Prepares a BlockDriverState for reopen. All changes are staged in the
+ * 'opaque' field of the BDRVReopenState, which is used and allocated by
+ * the block driver's .bdrv_reopen_prepare() callback.
+ *
+ * bs is the BlockDriverState to reopen
+ * flags are the new open flags
+ * queue is the reopen queue
+ *
+ * Returns 0 on success, non-zero on error. On error errp will be set
+ * as well.
+ *
+ * On failure, bdrv_reopen_abort() will be called to clean up any data.
+ * It is the responsibility of the caller to then call the abort() or
+ * commit() for any other BDS that have been left in a prepare() state
+ *
+ */
+int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
+ Error **errp)
+{
+ int ret = -1;
+ Error *local_err = NULL;
+ BlockDriver *drv;
+
+ assert(reopen_state != NULL);
+ assert(reopen_state->bs->drv != NULL);
+ drv = reopen_state->bs->drv;
+
+ /* if we are to stay read-only, do not allow permission change
+ * to r/w */
+ if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
+ reopen_state->flags & BDRV_O_RDWR) {
+ error_set(errp, QERR_DEVICE_IS_READ_ONLY,
+ reopen_state->bs->device_name);
+ goto error;
+ }
+
+
+ ret = bdrv_flush(reopen_state->bs);
+ if (ret) {
+ error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
+ strerror(-ret));
+ goto error;
+ }
+
+ if (drv->bdrv_reopen_prepare) {
+ ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
+ if (ret) {
+ if (local_err != NULL) {
+ error_propagate(errp, local_err);
+ } else {
+ error_setg(errp, "failed while preparing to reopen image '%s'",
+ reopen_state->bs->filename);
+ }
+ goto error;
+ }
+ } else {
+ /* It is currently mandatory to have a bdrv_reopen_prepare()
+ * handler for each supported drv. */
+ error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ drv->format_name, reopen_state->bs->device_name,
+ "reopening of file");
+ ret = -1;
+ goto error;
+ }
+
+ ret = 0;
+
+error:
+ return ret;
+}
+
+/*
+ * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
+ * makes them final by swapping the staging BlockDriverState contents into
+ * the active BlockDriverState contents.
+ */
+void bdrv_reopen_commit(BDRVReopenState *reopen_state)
+{
+ BlockDriver *drv;
+
+ assert(reopen_state != NULL);
+ drv = reopen_state->bs->drv;
+ assert(drv != NULL);
+
+ /* If there are any driver level actions to take */
+ if (drv->bdrv_reopen_commit) {
+ drv->bdrv_reopen_commit(reopen_state);
+ }
+
+ /* set BDS specific flags now */
+ reopen_state->bs->open_flags = reopen_state->flags;
+ reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
+ BDRV_O_CACHE_WB);
+ reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
+}
+
+/*
+ * Abort the reopen, and delete and free the staged changes in
+ * reopen_state
+ */
+void bdrv_reopen_abort(BDRVReopenState *reopen_state)
+{
+ BlockDriver *drv;
+
+ assert(reopen_state != NULL);
+ drv = reopen_state->bs->drv;
+ assert(drv != NULL);
+
+ if (drv->bdrv_reopen_abort) {
+ drv->bdrv_reopen_abort(reopen_state);
+ }
+}
+
+
+void bdrv_close(BlockDriverState *bs)
+{
+ if (bs->job) {
+ block_job_cancel_sync(bs->job);
+ }
+ bdrv_drain_all(); /* complete I/O */
+ bdrv_flush(bs);
+ bdrv_drain_all(); /* in case flush left pending I/O */
+ notifier_list_notify(&bs->close_notifiers, bs);
+
+ if (bs->drv) {
+ if (bs->backing_hd) {
+ bdrv_delete(bs->backing_hd);
+ bs->backing_hd = NULL;
+ }
+ bs->drv->bdrv_close(bs);
+ g_free(bs->opaque);
+#ifdef _WIN32
+ if (bs->is_temporary) {
+ unlink(bs->filename);
+ }
+#endif
+ bs->opaque = NULL;
+ bs->drv = NULL;
+ bs->copy_on_read = 0;
+ bs->backing_file[0] = '\0';
+ bs->backing_format[0] = '\0';
+ bs->total_sectors = 0;
+ bs->encrypted = 0;
+ bs->valid_key = 0;
+ bs->sg = 0;
+ bs->growable = 0;
+ QDECREF(bs->options);
+ bs->options = NULL;
+
+ if (bs->file != NULL) {
+ bdrv_delete(bs->file);
+ bs->file = NULL;
+ }
+ }
+
+ bdrv_dev_change_media_cb(bs, false);
+
+    /* throttling disk I/O limits */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_disable(bs);
+ }
+}
+
+void bdrv_close_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ bdrv_close(bs);
+ }
+}
+
+/*
+ * Wait for pending requests to complete across all BlockDriverStates
+ *
+ * This function does not flush data to disk, use bdrv_flush_all() for that
+ * after calling this function.
+ *
+ * Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a coroutine
+ * can be arbitrarily complex and a constant flow of I/O can come until the
+ * coroutine is complete. Because of this, it is not possible to have a
+ * function to drain a single device's I/O queue.
+ */
+void bdrv_drain_all(void)
+{
+ BlockDriverState *bs;
+ bool busy;
+
+ do {
+ busy = qemu_aio_wait();
+
+ /* FIXME: We do not have timer support here, so this is effectively
+ * a busy wait.
+ */
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+ qemu_co_queue_restart_all(&bs->throttled_reqs);
+ busy = true;
+ }
+ }
+ } while (busy);
+
+ /* If requests are still pending there is a bug somewhere */
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ assert(QLIST_EMPTY(&bs->tracked_requests));
+ assert(qemu_co_queue_empty(&bs->throttled_reqs));
+ }
+}
+
+/* Make a BlockDriverState anonymous by removing it from the bdrv_states list.
+   Also, clear device_name to prevent a double remove. */
+void bdrv_make_anon(BlockDriverState *bs)
+{
+ if (bs->device_name[0] != '\0') {
+ QTAILQ_REMOVE(&bdrv_states, bs, list);
+ }
+ bs->device_name[0] = '\0';
+}
+
+static void bdrv_rebind(BlockDriverState *bs)
+{
+ if (bs->drv && bs->drv->bdrv_rebind) {
+ bs->drv->bdrv_rebind(bs);
+ }
+}
+
+static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
+ BlockDriverState *bs_src)
+{
+ /* move some fields that need to stay attached to the device */
+ bs_dest->open_flags = bs_src->open_flags;
+
+ /* dev info */
+ bs_dest->dev_ops = bs_src->dev_ops;
+ bs_dest->dev_opaque = bs_src->dev_opaque;
+ bs_dest->dev = bs_src->dev;
+ bs_dest->buffer_alignment = bs_src->buffer_alignment;
+ bs_dest->copy_on_read = bs_src->copy_on_read;
+
+ bs_dest->enable_write_cache = bs_src->enable_write_cache;
+
+ /* i/o timing parameters */
+ bs_dest->slice_start = bs_src->slice_start;
+ bs_dest->slice_end = bs_src->slice_end;
+ bs_dest->slice_submitted = bs_src->slice_submitted;
+ bs_dest->io_limits = bs_src->io_limits;
+ bs_dest->throttled_reqs = bs_src->throttled_reqs;
+ bs_dest->block_timer = bs_src->block_timer;
+ bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
+
+ /* r/w error */
+ bs_dest->on_read_error = bs_src->on_read_error;
+ bs_dest->on_write_error = bs_src->on_write_error;
+
+ /* i/o status */
+ bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
+ bs_dest->iostatus = bs_src->iostatus;
+
+ /* dirty bitmap */
+ bs_dest->dirty_bitmap = bs_src->dirty_bitmap;
+
+ /* job */
+ bs_dest->in_use = bs_src->in_use;
+ bs_dest->job = bs_src->job;
+
+ /* keep the same entry in bdrv_states */
+ pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
+ bs_src->device_name);
+ bs_dest->list = bs_src->list;
+}
+
+/*
+ * Swap bs contents for two image chains while they are live,
+ * while keeping required fields on the BlockDriverState that is
+ * actually attached to a device.
+ *
+ * This will modify the BlockDriverState fields, and swap contents
+ * between bs_new and bs_old. Both bs_new and bs_old are modified.
+ *
+ * bs_new is required to be anonymous.
+ *
+ * This function does not create any image files.
+ */
+void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
+{
+ BlockDriverState tmp;
+
+ /* bs_new must be anonymous and shouldn't have anything fancy enabled */
+ assert(bs_new->device_name[0] == '\0');
+ assert(bs_new->dirty_bitmap == NULL);
+ assert(bs_new->job == NULL);
+ assert(bs_new->dev == NULL);
+ assert(bs_new->in_use == 0);
+ assert(bs_new->io_limits_enabled == false);
+ assert(bs_new->block_timer == NULL);
+
+ tmp = *bs_new;
+ *bs_new = *bs_old;
+ *bs_old = tmp;
+
+ /* there are some fields that should not be swapped, move them back */
+ bdrv_move_feature_fields(&tmp, bs_old);
+ bdrv_move_feature_fields(bs_old, bs_new);
+ bdrv_move_feature_fields(bs_new, &tmp);
+
+ /* bs_new shouldn't be in bdrv_states even after the swap! */
+ assert(bs_new->device_name[0] == '\0');
+
+ /* Check a few fields that should remain attached to the device */
+ assert(bs_new->dev == NULL);
+ assert(bs_new->job == NULL);
+ assert(bs_new->in_use == 0);
+ assert(bs_new->io_limits_enabled == false);
+ assert(bs_new->block_timer == NULL);
+
+ bdrv_rebind(bs_new);
+ bdrv_rebind(bs_old);
+}
+
+/*
+ * Add new bs contents at the top of an image chain while the chain is
+ * live, while keeping required fields on the top layer.
+ *
+ * This will modify the BlockDriverState fields, and swap contents
+ * between bs_new and bs_top. Both bs_new and bs_top are modified.
+ *
+ * bs_new is required to be anonymous.
+ *
+ * This function does not create any image files.
+ */
+void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
+{
+ bdrv_swap(bs_new, bs_top);
+
+ /* The contents of 'tmp' will become bs_top, as we are
+ * swapping bs_new and bs_top contents. */
+ bs_top->backing_hd = bs_new;
+ bs_top->open_flags &= ~BDRV_O_NO_BACKING;
+ pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
+ bs_new->filename);
+ pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
+ bs_new->drv ? bs_new->drv->format_name : "");
+}
+
+void bdrv_delete(BlockDriverState *bs)
+{
+ assert(!bs->dev);
+ assert(!bs->job);
+ assert(!bs->in_use);
+
+ /* remove from list, if necessary */
+ bdrv_make_anon(bs);
+
+ bdrv_close(bs);
+
+ g_free(bs);
+}
+
+int bdrv_attach_dev(BlockDriverState *bs, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+ if (bs->dev) {
+ return -EBUSY;
+ }
+ bs->dev = dev;
+ bdrv_iostatus_reset(bs);
+ return 0;
+}
+
+/* TODO qdevified devices don't use this, remove when devices are qdevified */
+void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
+{
+ if (bdrv_attach_dev(bs, dev) < 0) {
+ abort();
+ }
+}
+
+void bdrv_detach_dev(BlockDriverState *bs, void *dev)
+/* TODO change to DeviceState *dev when all users are qdevified */
+{
+ assert(bs->dev == dev);
+ bs->dev = NULL;
+ bs->dev_ops = NULL;
+ bs->dev_opaque = NULL;
+ bs->buffer_alignment = 512;
+}
+
+/* TODO change to return DeviceState * when all users are qdevified */
+void *bdrv_get_attached_dev(BlockDriverState *bs)
+{
+ return bs->dev;
+}
+
+void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
+ void *opaque)
+{
+ bs->dev_ops = ops;
+ bs->dev_opaque = opaque;
+}
+
+void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
+ enum MonitorEvent ev,
+ BlockErrorAction action, bool is_read)
+{
+ QObject *data;
+ const char *action_str;
+
+ switch (action) {
+ case BDRV_ACTION_REPORT:
+ action_str = "report";
+ break;
+ case BDRV_ACTION_IGNORE:
+ action_str = "ignore";
+ break;
+ case BDRV_ACTION_STOP:
+ action_str = "stop";
+ break;
+ default:
+ abort();
+ }
+
+ data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
+ bdrv->device_name,
+ action_str,
+ is_read ? "read" : "write");
+ monitor_protocol_event(ev, data);
+
+ qobject_decref(data);
+}
+
+static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
+{
+ QObject *data;
+
+ data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
+ bdrv_get_device_name(bs), ejected);
+ monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
+
+ qobject_decref(data);
+}
+
+static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
+{
+ if (bs->dev_ops && bs->dev_ops->change_media_cb) {
+ bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
+ bs->dev_ops->change_media_cb(bs->dev_opaque, load);
+ if (tray_was_closed) {
+ /* tray open */
+ bdrv_emit_qmp_eject_event(bs, true);
+ }
+ if (load) {
+ /* tray close */
+ bdrv_emit_qmp_eject_event(bs, false);
+ }
+ }
+}
+
+bool bdrv_dev_has_removable_media(BlockDriverState *bs)
+{
+ return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
+}
+
+void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
+{
+ if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
+ bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
+ }
+}
+
+bool bdrv_dev_is_tray_open(BlockDriverState *bs)
+{
+ if (bs->dev_ops && bs->dev_ops->is_tray_open) {
+ return bs->dev_ops->is_tray_open(bs->dev_opaque);
+ }
+ return false;
+}
+
+static void bdrv_dev_resize_cb(BlockDriverState *bs)
+{
+ if (bs->dev_ops && bs->dev_ops->resize_cb) {
+ bs->dev_ops->resize_cb(bs->dev_opaque);
+ }
+}
+
+bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
+{
+ if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
+ return bs->dev_ops->is_medium_locked(bs->dev_opaque);
+ }
+ return false;
+}
+
+/*
+ * Run consistency checks on an image
+ *
+ * Returns 0 if the check could be completed (it doesn't mean that the image is
+ * free of errors) or -errno when an internal error occurred. The results of the
+ * check are stored in res.
+ */
+int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
+{
+ if (bs->drv->bdrv_check == NULL) {
+ return -ENOTSUP;
+ }
+
+ memset(res, 0, sizeof(*res));
+ return bs->drv->bdrv_check(bs, res, fix);
+}
+
+#define COMMIT_BUF_SECTORS 2048
+
+/* commit COW file into the raw image */
+int bdrv_commit(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ int64_t sector, total_sectors;
+ int n, ro, open_flags;
+ int ret = 0;
+ uint8_t *buf;
+ char filename[PATH_MAX];
+
+ if (!drv)
+ return -ENOMEDIUM;
+
+ if (!bs->backing_hd) {
+ return -ENOTSUP;
+ }
+
+ if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
+ return -EBUSY;
+ }
+
+ ro = bs->backing_hd->read_only;
+ /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
+ pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
+ open_flags = bs->backing_hd->open_flags;
+
+ if (ro) {
+ if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
+ return -EACCES;
+ }
+ }
+
+ total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+ buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
+
+ for (sector = 0; sector < total_sectors; sector += n) {
+ if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
+
+ if (bdrv_read(bs, sector, buf, n) != 0) {
+ ret = -EIO;
+ goto ro_cleanup;
+ }
+
+ if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
+ ret = -EIO;
+ goto ro_cleanup;
+ }
+ }
+ }
+
+ if (drv->bdrv_make_empty) {
+ ret = drv->bdrv_make_empty(bs);
+ bdrv_flush(bs);
+ }
+
+ /*
+ * Make sure all data we wrote to the backing device is actually
+ * stable on disk.
+ */
+ if (bs->backing_hd)
+ bdrv_flush(bs->backing_hd);
+
+ro_cleanup:
+ g_free(buf);
+
+ if (ro) {
+ /* ignoring error return here */
+ bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
+ }
+
+ return ret;
+}
+
+int bdrv_commit_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ if (bs->drv && bs->backing_hd) {
+ int ret = bdrv_commit(bs);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+ QLIST_REMOVE(req, list);
+ qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+ BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, bool is_write)
+{
+ *req = (BdrvTrackedRequest){
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .is_write = is_write,
+ .co = qemu_coroutine_self(),
+ };
+
+ qemu_co_queue_init(&req->wait_queue);
+
+ QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
+/**
+ * Round a region to cluster boundaries
+ */
+void bdrv_round_to_clusters(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ int64_t *cluster_sector_num,
+ int *cluster_nb_sectors)
+{
+ BlockDriverInfo bdi;
+
+ if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+ *cluster_sector_num = sector_num;
+ *cluster_nb_sectors = nb_sectors;
+ } else {
+ int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
+ *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+ *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+ nb_sectors, c);
+ }
+}
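+
+/* Worked example (hypothetical numbers): with bdi.cluster_size == 65536,
+ * i.e. 128 sectors of 512 bytes, a request covering sectors [130, 136)
+ * is widened to the cluster-aligned region [128, 256): *cluster_sector_num
+ * becomes 128 and *cluster_nb_sectors becomes 128.
+ */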
+
+static bool tracked_request_overlaps(BdrvTrackedRequest *req,
+ int64_t sector_num, int nb_sectors) {
+ /* aaaa bbbb */
+ if (sector_num >= req->sector_num + req->nb_sectors) {
+ return false;
+ }
+ /* bbbb aaaa */
+ if (req->sector_num >= sector_num + nb_sectors) {
+ return false;
+ }
+ return true;
+}
+
+static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BdrvTrackedRequest *req;
+ int64_t cluster_sector_num;
+ int cluster_nb_sectors;
+ bool retry;
+
+ /* If we touch the same cluster it counts as an overlap. This guarantees
+ * that allocating writes will be serialized and not race with each other
+ * for the same cluster. For example, in copy-on-read it ensures that the
+ * CoR read and write operations are atomic and guest writes cannot
+ * interleave between them.
+ */
+ bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+ &cluster_sector_num, &cluster_nb_sectors);
+
+ do {
+ retry = false;
+ QLIST_FOREACH(req, &bs->tracked_requests, list) {
+ if (tracked_request_overlaps(req, cluster_sector_num,
+ cluster_nb_sectors)) {
+ /* Hitting this means there was a reentrant request, for
+ * example, a block driver issuing nested requests. This must
+ * never happen since it means deadlock.
+ */
+ assert(qemu_coroutine_self() != req->co);
+
+ qemu_co_queue_wait(&req->wait_queue);
+ retry = true;
+ break;
+ }
+ }
+ } while (retry);
+}
+
+/*
+ * Return values:
+ * 0 - success
+ * -EINVAL - backing format specified, but no file
+ * -ENOSPC - can't update the backing file because no space is left in the
+ * image file header
+ * -ENOTSUP - format driver doesn't support changing the backing file
+ */
+int bdrv_change_backing_file(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+
+ /* Backing file format doesn't make sense without a backing file */
+ if (backing_fmt && !backing_file) {
+ return -EINVAL;
+ }
+
+ if (drv->bdrv_change_backing_file != NULL) {
+ ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
+ } else {
+ ret = -ENOTSUP;
+ }
+
+ if (ret == 0) {
+ pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
+ }
+ return ret;
+}
+
+/*
+ * Finds the image layer in the chain that has 'bs' as its backing file.
+ *
+ * active is the current topmost image.
+ *
+ * Returns NULL if bs is not found in active's image chain,
+ * or if active == bs.
+ */
+BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
+ BlockDriverState *bs)
+{
+ BlockDriverState *overlay = NULL;
+ BlockDriverState *intermediate;
+
+ assert(active != NULL);
+ assert(bs != NULL);
+
+ /* if bs is the same as active, then by definition it has no overlay
+ */
+ if (active == bs) {
+ return NULL;
+ }
+
+ intermediate = active;
+ while (intermediate->backing_hd) {
+ if (intermediate->backing_hd == bs) {
+ overlay = intermediate;
+ break;
+ }
+ intermediate = intermediate->backing_hd;
+ }
+
+ return overlay;
+}
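+
+/* Example (hypothetical chain): for base <- sn1 <- active,
+ * bdrv_find_overlay(active, base) returns sn1, bdrv_find_overlay(active,
+ * active) returns NULL, and bdrv_find_overlay(active, sn1) returns active.
+ */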
+
+typedef struct BlkIntermediateStates {
+ BlockDriverState *bs;
+ QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
+} BlkIntermediateStates;
+
+
+/*
+ * Drops images above 'base' up to and including 'top', and sets the image
+ * above 'top' to have base as its backing file.
+ *
+ * Requires that the overlay to 'top' is opened r/w, so that the backing file
+ * information in 'bs' can be properly updated.
+ *
+ * E.g., this will convert the following chain:
+ * bottom <- base <- intermediate <- top <- active
+ *
+ * to
+ *
+ * bottom <- base <- active
+ *
+ * It is allowed for bottom==base, in which case it converts:
+ *
+ * base <- intermediate <- top <- active
+ *
+ * to
+ *
+ * base <- active
+ *
+ * Error conditions:
+ * if active == top, that is considered an error
+ *
+ */
+int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
+ BlockDriverState *base)
+{
+ BlockDriverState *intermediate;
+ BlockDriverState *base_bs = NULL;
+ BlockDriverState *new_top_bs = NULL;
+ BlkIntermediateStates *intermediate_state, *next;
+ int ret = -EIO;
+
+ QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
+ QSIMPLEQ_INIT(&states_to_delete);
+
+ if (!top->drv || !base->drv) {
+ goto exit;
+ }
+
+ new_top_bs = bdrv_find_overlay(active, top);
+
+ if (new_top_bs == NULL) {
+ /* we could not find the image above 'top'; this is an error */
+ goto exit;
+ }
+
+ /* special case of new_top_bs->backing_hd already pointing to base - nothing
+ * to do, no intermediate images */
+ if (new_top_bs->backing_hd == base) {
+ ret = 0;
+ goto exit;
+ }
+
+ intermediate = top;
+
+ /* now we will go down through the list, and add each BDS we find
+ * into our deletion queue, until we hit the 'base'
+ */
+ while (intermediate) {
+ intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
+ intermediate_state->bs = intermediate;
+ QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
+
+ if (intermediate->backing_hd == base) {
+ base_bs = intermediate->backing_hd;
+ break;
+ }
+ intermediate = intermediate->backing_hd;
+ }
+ if (base_bs == NULL) {
+ /* something went wrong: we did not end at the base. Safely
+ * unravel everything and exit with an error */
+ goto exit;
+ }
+
+ /* success - we can delete the intermediate states, and link top->base */
+ ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
+ base_bs->drv ? base_bs->drv->format_name : "");
+ if (ret) {
+ goto exit;
+ }
+ new_top_bs->backing_hd = base_bs;
+
+
+ QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
+ /* so that bdrv_close() does not recursively close the chain */
+ intermediate_state->bs->backing_hd = NULL;
+ bdrv_delete(intermediate_state->bs);
+ }
+ ret = 0;
+
+exit:
+ QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
+ g_free(intermediate_state);
+ }
+ return ret;
+}
+
+
+static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
+ size_t size)
+{
+ int64_t len;
+
+ if (!bdrv_is_inserted(bs))
+ return -ENOMEDIUM;
+
+ if (bs->growable)
+ return 0;
+
+ len = bdrv_getlength(bs);
+
+ if (offset < 0)
+ return -EIO;
+
+ if ((offset > len) || (len - offset < size))
+ return -EIO;
+
+ return 0;
+}
+
+static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
+ nb_sectors * BDRV_SECTOR_SIZE);
+}
+
+typedef struct RwCo {
+ BlockDriverState *bs;
+ int64_t sector_num;
+ int nb_sectors;
+ QEMUIOVector *qiov;
+ bool is_write;
+ int ret;
+} RwCo;
+
+static void coroutine_fn bdrv_rw_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ if (!rwco->is_write) {
+ rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
+ rwco->nb_sectors, rwco->qiov, 0);
+ } else {
+ rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
+ rwco->nb_sectors, rwco->qiov, 0);
+ }
+}
+
+/*
+ * Process a vectored synchronous request using coroutines
+ */
+static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, bool is_write)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
+ .qiov = qiov,
+ .is_write = is_write,
+ .ret = NOT_DONE,
+ };
+ assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+ /**
+ * In a synchronous call context the vCPU is blocked, so the throttling
+ * timer cannot fire; therefore I/O throttling has to be disabled here
+ * if it has been enabled.
+ */
+ if (bs->io_limits_enabled) {
+ fprintf(stderr, "Disabling I/O throttling on '%s' due "
+ "to synchronous I/O.\n", bdrv_get_device_name(bs));
+ bdrv_io_limits_disable(bs);
+ }
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_rw_co_entry(&rwco);
+ } else {
+ co = qemu_coroutine_create(bdrv_rw_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+ return rwco.ret;
+}
+
+/*
+ * Process a synchronous request using coroutines
+ */
+static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
+ int nb_sectors, bool is_write)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
+}
+
+/* return < 0 if error. See bdrv_write() for the return codes */
+int bdrv_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
+}
+
+/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
+int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ bool enabled;
+ int ret;
+
+ enabled = bs->io_limits_enabled;
+ bs->io_limits_enabled = false;
+ ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+ bs->io_limits_enabled = enabled;
+ return ret;
+}
+
+/* Return < 0 if error. Important errors are:
+ -EIO generic I/O error (may happen for all errors)
+ -ENOMEDIUM No media inserted.
+ -EINVAL Invalid sector number or nb_sectors
+ -EACCES Trying to write a read-only device
+*/
+int bdrv_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
+}
+
+int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
+{
+ return bdrv_rwv_co(bs, sector_num, qiov, true);
+}
+
+int bdrv_pread(BlockDriverState *bs, int64_t offset,
+ void *buf, int count1)
+{
+ uint8_t tmp_buf[BDRV_SECTOR_SIZE];
+ int len, nb_sectors, count;
+ int64_t sector_num;
+ int ret;
+
+ count = count1;
+ /* first read to align to sector start */
+ len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
+ if (len > count)
+ len = count;
+ sector_num = offset >> BDRV_SECTOR_BITS;
+ if (len > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
+ count -= len;
+ if (count == 0)
+ return count1;
+ sector_num++;
+ buf += len;
+ }
+
+ /* read the sectors "in place" */
+ nb_sectors = count >> BDRV_SECTOR_BITS;
+ if (nb_sectors > 0) {
+ if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
+ return ret;
+ sector_num += nb_sectors;
+ len = nb_sectors << BDRV_SECTOR_BITS;
+ buf += len;
+ count -= len;
+ }
+
+ /* add data from the last sector */
+ if (count > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ memcpy(buf, tmp_buf, count);
+ }
+ return count1;
+}
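+
+/* Worked example (hypothetical values): bdrv_pread(bs, 300, buf, 1000)
+ * first reads sector 0 and copies the 212 bytes at offsets 300..511, then
+ * reads one full sector (bytes 512..1023) directly into buf, and finally
+ * reads sector 2 into tmp_buf to copy the remaining 276 bytes.
+ */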
+
+int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
+{
+ uint8_t tmp_buf[BDRV_SECTOR_SIZE];
+ int len, nb_sectors, count;
+ int64_t sector_num;
+ int ret;
+
+ count = qiov->size;
+
+ /* first write to align to sector start */
+ len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
+ if (len > count)
+ len = count;
+ sector_num = offset >> BDRV_SECTOR_BITS;
+ if (len > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
+ len);
+ if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ count -= len;
+ if (count == 0)
+ return qiov->size;
+ sector_num++;
+ }
+
+ /* write the sectors "in place" */
+ nb_sectors = count >> BDRV_SECTOR_BITS;
+ if (nb_sectors > 0) {
+ QEMUIOVector qiov_inplace;
+
+ qemu_iovec_init(&qiov_inplace, qiov->niov);
+ qemu_iovec_concat(&qiov_inplace, qiov, len,
+ nb_sectors << BDRV_SECTOR_BITS);
+ ret = bdrv_writev(bs, sector_num, &qiov_inplace);
+ qemu_iovec_destroy(&qiov_inplace);
+ if (ret < 0) {
+ return ret;
+ }
+
+ sector_num += nb_sectors;
+ len = nb_sectors << BDRV_SECTOR_BITS;
+ count -= len;
+ }
+
+ /* add data from the last sector */
+ if (count > 0) {
+ if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
+ if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
+ return ret;
+ }
+ return qiov->size;
+}
+
+int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count1)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = count1,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_pwritev(bs, offset, &qiov);
+}
+
+/*
+ * Writes to the file and ensures that no writes are reordered across this
+ * request (acts as a barrier)
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count)
+{
+ int ret;
+
+ ret = bdrv_pwrite(bs, offset, buf, count);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* No flush needed for cache modes that already do it */
+ if (bs->enable_write_cache) {
+ bdrv_flush(bs);
+ }
+
+ return 0;
+}
+
+static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ /* Perform I/O through a temporary buffer so that users who scribble over
+ * their read buffer while the operation is in progress do not end up
+ * modifying the image file. This is critical for zero-copy guest I/O
+ * where anything might happen inside guest memory.
+ */
+ void *bounce_buffer;
+
+ BlockDriver *drv = bs->drv;
+ struct iovec iov;
+ QEMUIOVector bounce_qiov;
+ int64_t cluster_sector_num;
+ int cluster_nb_sectors;
+ size_t skip_bytes;
+ int ret;
+
+ /* Cover the entire cluster so no additional backing file I/O is required
+ * when allocating the cluster in the image file.
+ */
+ bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+ &cluster_sector_num, &cluster_nb_sectors);
+
+ trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
+ cluster_sector_num, cluster_nb_sectors);
+
+ iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+ iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+ qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+ ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ if (ret < 0) {
+ goto err;
+ }
+
+ if (drv->bdrv_co_write_zeroes &&
+ buffer_is_zero(bounce_buffer, iov.iov_len)) {
+ ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
+ cluster_nb_sectors);
+ } else {
+ /* This does not change the data on the disk, so it is not necessary
+ * to flush even in cache=writethrough mode.
+ */
+ ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ }
+
+ if (ret < 0) {
+ /* It might be okay to ignore write errors for guest requests. If this
+ * is a deliberate copy-on-read then we don't want to ignore the error.
+ * Simply report it in all cases.
+ */
+ goto err;
+ }
+
+ skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+ qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
+ nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+ qemu_vfree(bounce_buffer);
+ return ret;
+}
+
+/*
+ * Handle a read request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest req;
+ int ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ return -EIO;
+ }
+
+ /* throttling disk read I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, false, nb_sectors);
+ }
+
+ if (bs->copy_on_read) {
+ flags |= BDRV_REQ_COPY_ON_READ;
+ }
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ bs->copy_on_read_in_flight++;
+ }
+
+ if (bs->copy_on_read_in_flight) {
+ wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+ }
+
+ tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ int pnum;
+
+ ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (!ret || pnum != nb_sectors) {
+ ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+ goto out;
+ }
+ }
+
+ ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+
+out:
+ tracked_request_end(&req);
+
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ bs->copy_on_read_in_flight--;
+ }
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+ BDRV_REQ_COPY_ON_READ);
+}
+
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ BlockDriver *drv = bs->drv;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ int ret;
+
+ /* TODO Emulate only part of misaligned requests instead of letting block
+ * drivers return -ENOTSUP and emulate everything */
+
+ /* First try the efficient write zeroes operation */
+ if (drv->bdrv_co_write_zeroes) {
+ ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+ if (ret != -ENOTSUP) {
+ return ret;
+ }
+ }
+
+ /* Fall back to bounce buffer if write zeroes is unsupported */
+ iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
+ iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+ memset(iov.iov_base, 0, iov.iov_len);
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
+
+ qemu_vfree(iov.iov_base);
+ return ret;
+}
+
+/*
+ * Handle a write request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest req;
+ int ret;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+ if (bs->read_only) {
+ return -EACCES;
+ }
+ if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ return -EIO;
+ }
+
+ /* throttling disk write I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, true, nb_sectors);
+ }
+
+ if (bs->copy_on_read_in_flight) {
+ wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+ }
+
+ tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+
+ ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
+
+ if (ret < 0) {
+ /* Do nothing; the write notifier decided to fail this request */
+ } else if (flags & BDRV_REQ_ZERO_WRITE) {
+ ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+ } else {
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ }
+
+ if (ret == 0 && !bs->enable_write_cache) {
+ ret = bdrv_co_flush(bs);
+ }
+
+ if (bs->dirty_bitmap) {
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+ }
+
+ if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
+ bs->wr_highest_sector = sector_num + nb_sectors - 1;
+ }
+
+ tracked_request_end(&req);
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+ BDRV_REQ_ZERO_WRITE);
+}
+
+/**
+ * Truncate file to 'offset' bytes (needed only for file protocols)
+ */
+int bdrv_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (!drv->bdrv_truncate)
+ return -ENOTSUP;
+ if (bs->read_only)
+ return -EACCES;
+ if (bdrv_in_use(bs))
+ return -EBUSY;
+ ret = drv->bdrv_truncate(bs, offset);
+ if (ret == 0) {
+ ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
+ bdrv_dev_resize_cb(bs);
+ }
+ return ret;
+}
+
+/**
+ * Length of an allocated file in bytes. Sparse files are counted by their
+ * actual allocated space. Return < 0 on error or if unknown.
+ */
+int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_get_allocated_file_size) {
+ return drv->bdrv_get_allocated_file_size(bs);
+ }
+ if (bs->file) {
+ return bdrv_get_allocated_file_size(bs->file);
+ }
+ return -ENOTSUP;
+}
+
+/**
+ * Length of a file in bytes. Return < 0 if error or unknown.
+ */
+int64_t bdrv_getlength(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+
+ if (bs->growable || bdrv_dev_has_removable_media(bs)) {
+ if (drv->bdrv_getlength) {
+ return drv->bdrv_getlength(bs);
+ }
+ }
+ return bs->total_sectors * BDRV_SECTOR_SIZE;
+}
+
+/* return 0 as number of sectors if no device present or error */
+void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
+{
+ int64_t length;
+ length = bdrv_getlength(bs);
+ if (length < 0)
+ length = 0;
+ else
+ length = length >> BDRV_SECTOR_BITS;
+ *nb_sectors_ptr = length;
+}
+
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+ BlockIOLimit *io_limits)
+{
+ bs->io_limits = *io_limits;
+ bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
+}
+
+void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
+ BlockdevOnError on_write_error)
+{
+ bs->on_read_error = on_read_error;
+ bs->on_write_error = on_write_error;
+}
+
+BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
+{
+ return is_read ? bs->on_read_error : bs->on_write_error;
+}
+
+BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
+{
+ BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
+
+ switch (on_err) {
+ case BLOCKDEV_ON_ERROR_ENOSPC:
+ return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
+ case BLOCKDEV_ON_ERROR_STOP:
+ return BDRV_ACTION_STOP;
+ case BLOCKDEV_ON_ERROR_REPORT:
+ return BDRV_ACTION_REPORT;
+ case BLOCKDEV_ON_ERROR_IGNORE:
+ return BDRV_ACTION_IGNORE;
+ default:
+ abort();
+ }
+}
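+
+/* For example, with BLOCKDEV_ON_ERROR_ENOSPC an ENOSPC error stops the VM
+ * (BDRV_ACTION_STOP) while any other error is reported to the guest
+ * (BDRV_ACTION_REPORT).
+ */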
+
+/* This is done by device models because, while the block layer knows
+ * about the error, it does not know whether an operation comes from
+ * the device or the block layer (from a job, for example).
+ */
+void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
+ bool is_read, int error)
+{
+ assert(error >= 0);
+ bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
+ if (action == BDRV_ACTION_STOP) {
+ vm_stop(RUN_STATE_IO_ERROR);
+ bdrv_iostatus_set_err(bs, error);
+ }
+}
+
+int bdrv_is_read_only(BlockDriverState *bs)
+{
+ return bs->read_only;
+}
+
+int bdrv_is_sg(BlockDriverState *bs)
+{
+ return bs->sg;
+}
+
+int bdrv_enable_write_cache(BlockDriverState *bs)
+{
+ return bs->enable_write_cache;
+}
+
+void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
+{
+ bs->enable_write_cache = wce;
+
+ /* so a reopen() will preserve wce */
+ if (wce) {
+ bs->open_flags |= BDRV_O_CACHE_WB;
+ } else {
+ bs->open_flags &= ~BDRV_O_CACHE_WB;
+ }
+}
+
+int bdrv_is_encrypted(BlockDriverState *bs)
+{
+ if (bs->backing_hd && bs->backing_hd->encrypted)
+ return 1;
+ return bs->encrypted;
+}
+
+int bdrv_key_required(BlockDriverState *bs)
+{
+ BlockDriverState *backing_hd = bs->backing_hd;
+
+ if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
+ return 1;
+ return (bs->encrypted && !bs->valid_key);
+}
+
+int bdrv_set_key(BlockDriverState *bs, const char *key)
+{
+ int ret;
+ if (bs->backing_hd && bs->backing_hd->encrypted) {
+ ret = bdrv_set_key(bs->backing_hd, key);
+ if (ret < 0)
+ return ret;
+ if (!bs->encrypted)
+ return 0;
+ }
+ if (!bs->encrypted) {
+ return -EINVAL;
+ } else if (!bs->drv || !bs->drv->bdrv_set_key) {
+ return -ENOMEDIUM;
+ }
+ ret = bs->drv->bdrv_set_key(bs, key);
+ if (ret < 0) {
+ bs->valid_key = 0;
+ } else if (!bs->valid_key) {
+ bs->valid_key = 1;
+ /* call the change callback now, we skipped it on open */
+ bdrv_dev_change_media_cb(bs, true);
+ }
+ return ret;
+}
+
+const char *bdrv_get_format_name(BlockDriverState *bs)
+{
+ return bs->drv ? bs->drv->format_name : NULL;
+}
+
+void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
+ void *opaque)
+{
+ BlockDriver *drv;
+
+ QLIST_FOREACH(drv, &bdrv_drivers, list) {
+ it(opaque, drv->format_name);
+ }
+}
+
+BlockDriverState *bdrv_find(const char *name)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ if (!strcmp(name, bs->device_name)) {
+ return bs;
+ }
+ }
+ return NULL;
+}
+
+BlockDriverState *bdrv_next(BlockDriverState *bs)
+{
+ if (!bs) {
+ return QTAILQ_FIRST(&bdrv_states);
+ }
+ return QTAILQ_NEXT(bs, list);
+}
+
+void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ it(opaque, bs);
+ }
+}
+
+const char *bdrv_get_device_name(BlockDriverState *bs)
+{
+ return bs->device_name;
+}
+
+int bdrv_get_flags(BlockDriverState *bs)
+{
+ return bs->open_flags;
+}
+
+int bdrv_flush_all(void)
+{
+ BlockDriverState *bs;
+ int result = 0;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ int ret = bdrv_flush(bs);
+ if (ret < 0 && !result) {
+ result = ret;
+ }
+ }
+
+ return result;
+}
+
+int bdrv_has_zero_init_1(BlockDriverState *bs)
+{
+ return 1;
+}
+
+int bdrv_has_zero_init(BlockDriverState *bs)
+{
+ assert(bs->drv);
+
+ if (bs->drv->bdrv_has_zero_init) {
+ return bs->drv->bdrv_has_zero_init(bs);
+ }
+
+ /* safe default */
+ return 0;
+}
+
+typedef struct BdrvCoIsAllocatedData {
+ BlockDriverState *bs;
+ BlockDriverState *base;
+ int64_t sector_num;
+ int nb_sectors;
+ int *pnum;
+ int ret;
+ bool done;
+} BdrvCoIsAllocatedData;
+
+/*
+ * Returns true iff the specified sector is present in the disk image. Drivers
+ * not implementing the functionality are assumed to not support backing files,
+ * hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t n;
+
+ if (sector_num >= bs->total_sectors) {
+ *pnum = 0;
+ return 0;
+ }
+
+ n = bs->total_sectors - sector_num;
+ if (n < nb_sectors) {
+ nb_sectors = n;
+ }
+
+ if (!bs->drv->bdrv_co_is_allocated) {
+ *pnum = nb_sectors;
+ return 1;
+ }
+
+ return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
+}
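+
+/* Example (hypothetical values): on an image with total_sectors == 50,
+ * calling bdrv_co_is_allocated(bs, 40, 100, &pnum) first clamps nb_sectors
+ * to 10; a driver without .bdrv_co_is_allocated then reports the whole
+ * range as allocated, i.e. *pnum == 10 and a return value of 1.
+ */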
+
+/* Coroutine wrapper for bdrv_is_allocated() */
+static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
+{
+ BdrvCoIsAllocatedData *data = opaque;
+ BlockDriverState *bs = data->bs;
+
+ data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
+ data->pnum);
+ data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_is_allocated().
+ *
+ * See bdrv_co_is_allocated() for details.
+ */
+int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ int *pnum)
+{
+ Coroutine *co;
+ BdrvCoIsAllocatedData data = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .pnum = pnum,
+ .done = false,
+ };
+
+ co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
+ qemu_coroutine_enter(co, &data);
+ while (!data.done) {
+ qemu_aio_wait();
+ }
+ return data.ret;
+}
+
+/*
+ * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
+ *
+ * Return true if the given sector is allocated in any image between
+ * BASE and TOP (inclusive). BASE can be NULL to check if the given
+ * sector is allocated in any image of the chain. Return false otherwise.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ */
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ BlockDriverState *intermediate;
+ int ret, n = nb_sectors;
+
+ intermediate = top;
+ while (intermediate && intermediate != base) {
+ int pnum_inter;
+ ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
+ &pnum_inter);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ *pnum = pnum_inter;
+ return 1;
+ }
+
+ /*
+ * [sector_num, sector_num + nb_sectors) is unallocated on top but the
+ * intermediate image might have
+ *
+ * [sector_num + x, sector_num + nb_sectors) allocated.
+ */
+ if (n > pnum_inter &&
+ (intermediate == top ||
+ sector_num + pnum_inter < intermediate->total_sectors)) {
+ n = pnum_inter;
+ }
+
+ intermediate = intermediate->backing_hd;
+ }
+
+ *pnum = n;
+ return 0;
+}
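+
+/* Example (hypothetical chain): if the sector is unallocated in TOP but
+ * allocated in INTER1, the loop stops at INTER1 and returns 1 with *pnum
+ * taken from INTER1's answer; if no searched image has it allocated, the
+ * function returns 0 and *pnum holds the number of sectors known to be
+ * unallocated in all of them.
+ */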
+
+/* Coroutine wrapper for bdrv_is_allocated_above() */
+static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
+{
+ BdrvCoIsAllocatedData *data = opaque;
+ BlockDriverState *top = data->bs;
+ BlockDriverState *base = data->base;
+
+ data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
+ data->nb_sectors, data->pnum);
+ data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_is_allocated_above().
+ *
+ * See bdrv_co_is_allocated_above() for details.
+ */
+int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ Coroutine *co;
+ BdrvCoIsAllocatedData data = {
+ .bs = top,
+ .base = base,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .pnum = pnum,
+ .done = false,
+ };
+
+ co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
+ qemu_coroutine_enter(co, &data);
+ while (!data.done) {
+ qemu_aio_wait();
+ }
+ return data.ret;
+}
+
+const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
+{
+ if (bs->backing_hd && bs->backing_hd->encrypted)
+ return bs->backing_file;
+ else if (bs->encrypted)
+ return bs->filename;
+ else
+ return NULL;
+}
+
+void bdrv_get_backing_filename(BlockDriverState *bs,
+ char *filename, int filename_size)
+{
+ pstrcpy(filename, filename_size, bs->backing_file);
+}
+
+int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (!drv->bdrv_write_compressed)
+ return -ENOTSUP;
+ if (bdrv_check_request(bs, sector_num, nb_sectors))
+ return -EIO;
+
+ assert(!bs->dirty_bitmap);
+
+ return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
+}
+
+int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (!drv->bdrv_get_info)
+ return -ENOTSUP;
+ memset(bdi, 0, sizeof(*bdi));
+ return drv->bdrv_get_info(bs, bdi);
+}
+
+int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+ int64_t pos, int size)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = size,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_writev_vmstate(bs, &qiov, pos);
+}
+
+int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ } else if (drv->bdrv_save_vmstate) {
+ return drv->bdrv_save_vmstate(bs, qiov, pos);
+ } else if (bs->file) {
+ return bdrv_writev_vmstate(bs->file, qiov, pos);
+ }
+
+ return -ENOTSUP;
+}
+
+int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (drv->bdrv_load_vmstate)
+ return drv->bdrv_load_vmstate(bs, buf, pos, size);
+ if (bs->file)
+ return bdrv_load_vmstate(bs->file, buf, pos, size);
+ return -ENOTSUP;
+}
+
+void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
+{
+ if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
+ return;
+ }
+
+ bs->drv->bdrv_debug_event(bs, event);
+}
+
+int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
+ const char *tag)
+{
+ while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
+ bs = bs->file;
+ }
+
+ if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
+ return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
+ }
+
+ return -ENOTSUP;
+}
+
+int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
+{
+ while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
+ bs = bs->file;
+ }
+
+ if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
+ return bs->drv->bdrv_debug_resume(bs, tag);
+ }
+
+ return -ENOTSUP;
+}
+
+bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
+{
+ while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
+ bs = bs->file;
+ }
+
+ if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
+ return bs->drv->bdrv_debug_is_suspended(bs, tag);
+ }
+
+ return false;
+}
+
+int bdrv_is_snapshot(BlockDriverState *bs)
+{
+ return !!(bs->open_flags & BDRV_O_SNAPSHOT);
+}
+
+/* backing_file can either be relative, or absolute, or a protocol. If it is
+ * relative, it must be relative to the chain. So, passing in bs->filename
+ * from a BDS as backing_file should not be done, as that may be relative to
+ * the CWD rather than the chain. */
+BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
+ const char *backing_file)
+{
+ char *filename_full = NULL;
+ char *backing_file_full = NULL;
+ char *filename_tmp = NULL;
+ int is_protocol = 0;
+ BlockDriverState *curr_bs = NULL;
+ BlockDriverState *retval = NULL;
+
+ if (!bs || !bs->drv || !backing_file) {
+ return NULL;
+ }
+
+ filename_full = g_malloc(PATH_MAX);
+ backing_file_full = g_malloc(PATH_MAX);
+ filename_tmp = g_malloc(PATH_MAX);
+
+ is_protocol = path_has_protocol(backing_file);
+
+ for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
+
+ /* If either of the filename paths is actually a protocol, then
+ * compare unmodified paths; otherwise make paths relative */
+ if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
+ if (strcmp(backing_file, curr_bs->backing_file) == 0) {
+ retval = curr_bs->backing_hd;
+ break;
+ }
+ } else {
+ /* If not an absolute filename path, make it relative to the current
+ * image's filename path */
+ path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
+ backing_file);
+
+ /* We are going to compare absolute pathnames */
+ if (!realpath(filename_tmp, filename_full)) {
+ continue;
+ }
+
+ /* We need to make sure the backing filename we are comparing against
+ * is relative to the current image filename (or absolute) */
+ path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
+ curr_bs->backing_file);
+
+ if (!realpath(filename_tmp, backing_file_full)) {
+ continue;
+ }
+
+ if (strcmp(backing_file_full, filename_full) == 0) {
+ retval = curr_bs->backing_hd;
+ break;
+ }
+ }
+ }
+
+ g_free(filename_full);
+ g_free(backing_file_full);
+ g_free(filename_tmp);
+ return retval;
+}
+
+int bdrv_get_backing_file_depth(BlockDriverState *bs)
+{
+ if (!bs->drv) {
+ return 0;
+ }
+
+ if (!bs->backing_hd) {
+ return 0;
+ }
+
+ return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
+}
+
+BlockDriverState *bdrv_find_base(BlockDriverState *bs)
+{
+ BlockDriverState *curr_bs = NULL;
+
+ if (!bs) {
+ return NULL;
+ }
+
+ curr_bs = bs;
+
+ while (curr_bs->backing_hd) {
+ curr_bs = curr_bs->backing_hd;
+ }
+ return curr_bs;
+}
+
+/**************************************************************/
+/* async I/Os */
+
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
+ cb, opaque, false);
+}
+
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
+ cb, opaque, true);
+}
+
+
+typedef struct MultiwriteCB {
+ int error;
+ int num_requests;
+ int num_callbacks;
+ struct {
+ BlockDriverCompletionFunc *cb;
+ void *opaque;
+ QEMUIOVector *free_qiov;
+ } callbacks[];
+} MultiwriteCB;
+
+static void multiwrite_user_cb(MultiwriteCB *mcb)
+{
+ int i;
+
+ for (i = 0; i < mcb->num_callbacks; i++) {
+ mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+ if (mcb->callbacks[i].free_qiov) {
+ qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+ }
+ g_free(mcb->callbacks[i].free_qiov);
+ }
+}
+
+static void multiwrite_cb(void *opaque, int ret)
+{
+ MultiwriteCB *mcb = opaque;
+
+ trace_multiwrite_cb(mcb, ret);
+
+ if (ret < 0 && !mcb->error) {
+ mcb->error = ret;
+ }
+
+ mcb->num_requests--;
+ if (mcb->num_requests == 0) {
+ multiwrite_user_cb(mcb);
+ g_free(mcb);
+ }
+}
+
+static int multiwrite_req_compare(const void *a, const void *b)
+{
+ const BlockRequest *req1 = a, *req2 = b;
+
+ /*
+ * Note that we can't simply subtract req2->sector from req1->sector
+ * here as that could overflow the return value.
+ */
+ if (req1->sector > req2->sector) {
+ return 1;
+ } else if (req1->sector < req2->sector) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Takes a bunch of requests and tries to merge them. Returns the number of
+ * requests that remain after merging.
+ */
+static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
+ int num_reqs, MultiwriteCB *mcb)
+{
+ int i, outidx;
+
+ // Sort requests by start sector
+ qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
+
+ // Check if adjacent requests touch the same clusters. If so, combine them;
+ // only exactly sequential or overlapping requests are merged, so there are
+ // no gaps to fill.
+ outidx = 0;
+ for (i = 1; i < num_reqs; i++) {
+ int merge = 0;
+ int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
+
+ // Handle exactly sequential writes and overlapping writes.
+ if (reqs[i].sector <= oldreq_last) {
+ merge = 1;
+ }
+
+ if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+ merge = 0;
+ }
+
+ if (merge) {
+ size_t size;
+ QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
+ qemu_iovec_init(qiov,
+ reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
+
+ // Add the first request to the merged one. If the requests are
+ // overlapping, drop the last sectors of the first request.
+ size = (reqs[i].sector - reqs[outidx].sector) << 9;
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
+
+ // We should not need to add any zeros between the two requests
+ assert (reqs[i].sector <= oldreq_last);
+
+ // Add the second request
+ qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
+
+ reqs[outidx].nb_sectors = qiov->size >> 9;
+ reqs[outidx].qiov = qiov;
+
+ mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
+ } else {
+ outidx++;
+ reqs[outidx].sector = reqs[i].sector;
+ reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+ reqs[outidx].qiov = reqs[i].qiov;
+ }
+ }
+
+ return outidx + 1;
+}
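+
+/* Worked example (hypothetical requests): two writes of 8 sectors each at
+ * sectors 0 and 8 are exactly sequential (oldreq_last == 8), so they are
+ * merged into a single 16-sector request whose qiov concatenates both
+ * buffers; a third request starting at sector 20 would leave a gap and
+ * therefore stay separate.
+ */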
+
+/*
+ * Submit multiple AIO write requests at once.
+ *
+ * On success, the function returns 0 and all requests in the reqs array have
+ * been submitted. In the error case this function returns -1, and some of
+ * the requests may or may not have been submitted yet. In particular, this
+ * means that the callback will be called for some of the requests and not
+ * for others. The caller must check the error field of each BlockRequest to
+ * know which callbacks to wait for (if error != 0, no callback will be called).
+ *
+ * The implementation may modify the contents of the reqs array, e.g. to merge
+ * requests. However, the fields opaque and error are left unmodified as they
+ * are used to signal failure for a single request to the caller.
+ */
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+{
+ MultiwriteCB *mcb;
+ int i;
+
+ /* don't submit writes if we don't have a medium */
+ if (bs->drv == NULL) {
+ for (i = 0; i < num_reqs; i++) {
+ reqs[i].error = -ENOMEDIUM;
+ }
+ return -1;
+ }
+
+ if (num_reqs == 0) {
+ return 0;
+ }
+
+ // Create MultiwriteCB structure
+ mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+ mcb->num_requests = 0;
+ mcb->num_callbacks = num_reqs;
+
+ for (i = 0; i < num_reqs; i++) {
+ mcb->callbacks[i].cb = reqs[i].cb;
+ mcb->callbacks[i].opaque = reqs[i].opaque;
+ }
+
+ // Check for mergeable requests
+ num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+
+ trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
+
+ /* Run the aio requests. */
+ mcb->num_requests = num_reqs;
+ for (i = 0; i < num_reqs; i++) {
+ bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
+ reqs[i].nb_sectors, multiwrite_cb, mcb);
+ }
+
+ return 0;
+}
+
+void bdrv_aio_cancel(BlockDriverAIOCB *acb)
+{
+ acb->aiocb_info->cancel(acb);
+}
+
+/* block I/O throttling */
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait)
+{
+ uint64_t bps_limit = 0;
+ uint64_t extension;
+ double bytes_limit, bytes_base, bytes_res;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.bps[is_write]) {
+ bps_limit = bs->io_limits.bps[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ bytes_limit = bps_limit * slice_time;
+ bytes_base = bs->slice_submitted.bytes[is_write];
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bytes_base += bs->slice_submitted.bytes[!is_write];
+ }
+
+ /* bytes_base: the number of bytes already read/written in this slice,
+ * obtained from the accounting of previously submitted requests.
+ * bytes_res: the remaining bytes of data which need to be read/written.
+ * (bytes_base + bytes_res) / bps_limit: used to calculate the total time
+ * for completing the reading/writing of all this data.
+ */
+ bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+ if (bytes_base + bytes_res <= bytes_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch, in seconds */
+ wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
+
+ /* When the I/O rate at runtime exceeds the limits,
+ * bs->slice_end needs to be extended so that the current accounting
+ * information is kept until the timer fires; the extension is rounded up
+ * to a whole number of slice periods, a value tuned empirically.
+ */
+ extension = wait_time * NANOSECONDS_PER_SECOND;
+ extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
+ BLOCK_IO_SLICE_TIME;
+ bs->slice_end += extension;
+ if (wait) {
+ *wait = wait_time * NANOSECONDS_PER_SECOND;
+ }
+
+ return true;
+}
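+
+/* Worked example (hypothetical limits): with bps_limit == 1048576 (1 MiB/s)
+ * and a 0.1 s slice, bytes_limit is about 104858 bytes.  If 90000 bytes have
+ * already been accounted in this slice, a further 32-sector (16384 byte)
+ * request exceeds the budget, so the function extends bs->slice_end and
+ * returns true with *wait set to the estimated dispatch delay.
+ */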
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait)
+{
+ uint64_t iops_limit = 0;
+ double ios_limit, ios_base;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.iops[is_write]) {
+ iops_limit = bs->io_limits.iops[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end - bs->slice_start;
+ slice_time /= (NANOSECONDS_PER_SECOND);
+ ios_limit = iops_limit * slice_time;
+ ios_base = bs->slice_submitted.ios[is_write];
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ ios_base += bs->slice_submitted.ios[!is_write];
+ }
+
+ if (ios_base + 1 <= ios_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch, in seconds */
+ wait_time = (ios_base + 1) / iops_limit;
+ if (wait_time > elapsed_time) {
+ wait_time = wait_time - elapsed_time;
+ } else {
+ wait_time = 0;
+ }
+
+ /* Exceeded current slice, extend it by another slice time */
+ bs->slice_end += BLOCK_IO_SLICE_TIME;
+ if (wait) {
+ *wait = wait_time * NANOSECONDS_PER_SECOND;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, int64_t *wait)
+{
+ int64_t now, max_wait;
+ uint64_t bps_wait = 0, iops_wait = 0;
+ double elapsed_time;
+ int bps_ret, iops_ret;
+
+ now = qemu_get_clock_ns(vm_clock);
+ if (now > bs->slice_end) {
+ bs->slice_start = now;
+ bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+ memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
+ }
+
+ elapsed_time = now - bs->slice_start;
+ elapsed_time /= (NANOSECONDS_PER_SECOND);
+
+ bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
+ is_write, elapsed_time, &bps_wait);
+ iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+ elapsed_time, &iops_wait);
+ if (bps_ret || iops_ret) {
+ max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+ if (wait) {
+ *wait = max_wait;
+ }
+
+ now = qemu_get_clock_ns(vm_clock);
+ if (bs->slice_end < now + max_wait) {
+ bs->slice_end = now + max_wait;
+ }
+
+ return true;
+ }
+
+ if (wait) {
+ *wait = 0;
+ }
+
+ bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
+ BDRV_SECTOR_SIZE;
+ bs->slice_submitted.ios[is_write]++;
+
+ return false;
+}
+
+/**************************************************************/
+/* async block device emulation */
+
+typedef struct BlockDriverAIOCBSync {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+ int ret;
+ /* vector translation state */
+ QEMUIOVector *qiov;
+ uint8_t *bounce;
+ int is_write;
+} BlockDriverAIOCBSync;
+
+static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
+{
+ BlockDriverAIOCBSync *acb =
+ container_of(blockacb, BlockDriverAIOCBSync, common);
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+ qemu_aio_release(acb);
+}
+
+static const AIOCBInfo bdrv_em_aiocb_info = {
+ .aiocb_size = sizeof(BlockDriverAIOCBSync),
+ .cancel = bdrv_aio_cancel_em,
+};
+
+static void bdrv_aio_bh_cb(void *opaque)
+{
+ BlockDriverAIOCBSync *acb = opaque;
+
+ if (!acb->is_write)
+ qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+ qemu_vfree(acb->bounce);
+ acb->common.cb(acb->common.opaque, acb->ret);
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+ qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque,
+ int is_write)
+
+{
+ BlockDriverAIOCBSync *acb;
+
+ acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
+ acb->is_write = is_write;
+ acb->qiov = qiov;
+ acb->bounce = qemu_blockalign(bs, qiov->size);
+ acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+
+ if (is_write) {
+ qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
+ acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
+ } else {
+ acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
+ }
+
+ qemu_bh_schedule(acb->bh);
+
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+
+typedef struct BlockDriverAIOCBCoroutine {
+ BlockDriverAIOCB common;
+ BlockRequest req;
+ bool is_write;
+ bool *done;
+ QEMUBH* bh;
+} BlockDriverAIOCBCoroutine;
+
+static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
+{
+ BlockDriverAIOCBCoroutine *acb =
+ container_of(blockacb, BlockDriverAIOCBCoroutine, common);
+ bool done = false;
+
+ acb->done = &done;
+ while (!done) {
+ qemu_aio_wait();
+ }
+}
+
+static const AIOCBInfo bdrv_em_co_aiocb_info = {
+ .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
+ .cancel = bdrv_aio_co_cancel_em,
+};
+
+static void bdrv_co_em_bh(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+
+ acb->common.cb(acb->common.opaque, acb->req.error);
+
+ if (acb->done) {
+ *acb->done = true;
+ }
+
+ qemu_bh_delete(acb->bh);
+ qemu_aio_release(acb);
+}
+
+/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
+static void coroutine_fn bdrv_co_do_rw(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ if (!acb->is_write) {
+ acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, 0);
+ } else {
+ acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, 0);
+ }
+
+ acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque,
+ bool is_write)
+{
+ Coroutine *co;
+ BlockDriverAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ acb->req.qiov = qiov;
+ acb->is_write = is_write;
+ acb->done = NULL;
+
+ co = qemu_coroutine_create(bdrv_co_do_rw);
+ qemu_coroutine_enter(co, acb);
+
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_flush(bs);
+ acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+}
+
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_flush(bs, opaque);
+
+ Coroutine *co;
+ BlockDriverAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->done = NULL;
+
+ co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+{
+ BlockDriverAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+ acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+}
+
+BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ Coroutine *co;
+ BlockDriverAIOCBCoroutine *acb;
+
+ trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ acb->done = NULL;
+ co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ return &acb->common;
+}
+
+void bdrv_init(void)
+{
+ module_call_init(MODULE_INIT_BLOCK);
+}
+
+void bdrv_init_with_whitelist(void)
+{
+ use_bdrv_whitelist = 1;
+ bdrv_init();
+}
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriverAIOCB *acb;
+
+ acb = g_slice_alloc(aiocb_info->aiocb_size);
+ acb->aiocb_info = aiocb_info;
+ acb->bs = bs;
+ acb->cb = cb;
+ acb->opaque = opaque;
+ return acb;
+}
+
+void qemu_aio_release(void *p)
+{
+ BlockDriverAIOCB *acb = p;
+ g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+}
+
+/**************************************************************/
+/* Coroutine block device emulation */
+
+typedef struct CoroutineIOCompletion {
+ Coroutine *coroutine;
+ int ret;
+} CoroutineIOCompletion;
+
+static void bdrv_co_io_em_complete(void *opaque, int ret)
+{
+ CoroutineIOCompletion *co = opaque;
+
+ co->ret = ret;
+ qemu_coroutine_enter(co->coroutine, NULL);
+}
+
+static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *iov,
+ bool is_write)
+{
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ BlockDriverAIOCB *acb;
+
+ if (is_write) {
+ acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ } else {
+ acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ }
+
+ trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
+ if (!acb) {
+ return -EIO;
+ }
+ qemu_coroutine_yield();
+
+ return co.ret;
+}
+
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
+}
+
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
+}
+
+static void coroutine_fn bdrv_flush_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_flush(rwco->bs);
+}
+
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+{
+ int ret;
+
+ if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+ return 0;
+ }
+
+ /* Write back cached data to the OS even with cache=unsafe */
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
+ if (bs->drv->bdrv_co_flush_to_os) {
+ ret = bs->drv->bdrv_co_flush_to_os(bs);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* But don't actually force it to the disk with cache=unsafe */
+ if (bs->open_flags & BDRV_O_NO_FLUSH) {
+ goto flush_parent;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
+ if (bs->drv->bdrv_co_flush_to_disk) {
+ ret = bs->drv->bdrv_co_flush_to_disk(bs);
+ } else if (bs->drv->bdrv_aio_flush) {
+ BlockDriverAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ ret = -EIO;
+ } else {
+ qemu_coroutine_yield();
+ ret = co.ret;
+ }
+ } else {
+ /*
+ * Some block drivers always operate in either writethrough or unsafe
+ * mode and therefore don't support bdrv_flush. Usually qemu doesn't
+ * know how the server works (because the behaviour is hardcoded or
+ * depends on server-side configuration), so we can't ensure that
+ * everything is safe on disk. Returning an error doesn't work because
+ * that would break guests even if the server operates in writethrough
+ * mode.
+ *
+ * Let's hope the user knows what he's doing.
+ */
+ ret = 0;
+ }
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
+ * in the case of cache=unsafe, so there are no useless flushes.
+ */
+flush_parent:
+ return bdrv_co_flush(bs->file);
+}
+
+void bdrv_invalidate_cache(BlockDriverState *bs)
+{
+ if (bs->drv && bs->drv->bdrv_invalidate_cache) {
+ bs->drv->bdrv_invalidate_cache(bs);
+ }
+}
+
+void bdrv_invalidate_cache_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ bdrv_invalidate_cache(bs);
+ }
+}
+
+void bdrv_clear_incoming_migration_all(void)
+{
+ BlockDriverState *bs;
+
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
+ }
+}
+
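+/*
+ * bdrv_flush() and bdrv_discard() share the same synchronous wrapper pattern:
+ * when already in coroutine context the entry function runs directly,
+ * otherwise a coroutine is created and qemu_aio_wait() is polled until
+ * rwco.ret is no longer NOT_DONE.
+ */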
+int bdrv_flush(BlockDriverState *bs)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_flush_co_entry(&rwco);
+ } else {
+ co = qemu_coroutine_create(bdrv_flush_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+
+ return rwco.ret;
+}
+
+static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+}
+
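+/*
+ * Discard requests fall through a chain of driver hooks: bdrv_co_discard for
+ * coroutine-based drivers, bdrv_aio_discard bridged via
+ * bdrv_co_io_em_complete() otherwise, and a silent no-op when the driver
+ * offers neither or BDRV_O_UNMAP is not set.
+ */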
+int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ return -EIO;
+ } else if (bs->read_only) {
+ return -EROFS;
+ }
+
+ if (bs->dirty_bitmap) {
+ bdrv_reset_dirty(bs, sector_num, nb_sectors);
+ }
+
+ /* Do nothing if disabled. */
+ if (!(bs->open_flags & BDRV_O_UNMAP)) {
+ return 0;
+ }
+
+ if (bs->drv->bdrv_co_discard) {
+ return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
+ } else if (bs->drv->bdrv_aio_discard) {
+ BlockDriverAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ return -EIO;
+ } else {
+ qemu_coroutine_yield();
+ return co.ret;
+ }
+ } else {
+ return 0;
+ }
+}
+
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_discard_co_entry(&rwco);
+ } else {
+ co = qemu_coroutine_create(bdrv_discard_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ qemu_aio_wait();
+ }
+ }
+
+ return rwco.ret;
+}
+
+/**************************************************************/
+/* removable device support */
+
+/**
+ * Return TRUE if the media is present
+ */
+int bdrv_is_inserted(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv)
+ return 0;
+ if (!drv->bdrv_is_inserted)
+ return 1;
+ return drv->bdrv_is_inserted(bs);
+}
+
+/**
+ * Return whether the media changed since the last call to this
+ * function, or -ENOTSUP if we don't know. Most drivers don't know.
+ */
+int bdrv_media_changed(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_media_changed) {
+ return drv->bdrv_media_changed(bs);
+ }
+ return -ENOTSUP;
+}
+
+/**
+ * If eject_flag is TRUE, eject the media. Otherwise, close the tray
+ */
+void bdrv_eject(BlockDriverState *bs, bool eject_flag)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_eject) {
+ drv->bdrv_eject(bs, eject_flag);
+ }
+
+ if (bs->device_name[0] != '\0') {
+ bdrv_emit_qmp_eject_event(bs, eject_flag);
+ }
+}
+
+/**
+ * Lock or unlock the media (if it is locked, the user won't be able
+ * to eject it manually).
+ */
+void bdrv_lock_medium(BlockDriverState *bs, bool locked)
+{
+ BlockDriver *drv = bs->drv;
+
+ trace_bdrv_lock_medium(bs, locked);
+
+ if (drv && drv->bdrv_lock_medium) {
+ drv->bdrv_lock_medium(bs, locked);
+ }
+}
+
+/* needed for generic scsi interface */
+
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_ioctl)
+ return drv->bdrv_ioctl(bs, req, buf);
+ return -ENOTSUP;
+}
+
+BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_aio_ioctl)
+ return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
+ return NULL;
+}
+
+void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
+{
+ bs->buffer_alignment = align;
+}
+
+void *qemu_blockalign(BlockDriverState *bs, size_t size)
+{
+ return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
+}
+
+/*
+ * Check if all memory in this vector is sector aligned.
+ */
+bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+{
+ int i;
+
+ for (i = 0; i < qiov->niov; i++) {
+ if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
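+/*
+ * Worked example for the conversion below: with a granularity of 65536 bytes,
+ * granularity >> BDRV_SECTOR_BITS is 128 sectors and ffs(128) - 1 = 7, so the
+ * dirty bitmap tracks changes at a 128-sector (64 KB) granularity.
+ */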
+void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
+{
+ int64_t bitmap_size;
+
+ assert((granularity & (granularity - 1)) == 0);
+
+ if (granularity) {
+ granularity >>= BDRV_SECTOR_BITS;
+ assert(!bs->dirty_bitmap);
+ bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
+ bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
+ } else {
+ if (bs->dirty_bitmap) {
+ hbitmap_free(bs->dirty_bitmap);
+ bs->dirty_bitmap = NULL;
+ }
+ }
+}
+
+int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
+{
+ if (bs->dirty_bitmap) {
+ return hbitmap_get(bs->dirty_bitmap, sector);
+ } else {
+ return 0;
+ }
+}
+
+void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
+{
+ hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
+}
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
+ int nr_sectors)
+{
+ hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
+}
+
+void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
+ int nr_sectors)
+{
+ hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
+}
+
+int64_t bdrv_get_dirty_count(BlockDriverState *bs)
+{
+ if (bs->dirty_bitmap) {
+ return hbitmap_count(bs->dirty_bitmap);
+ } else {
+ return 0;
+ }
+}
+
+void bdrv_set_in_use(BlockDriverState *bs, int in_use)
+{
+ assert(bs->in_use != in_use);
+ bs->in_use = in_use;
+}
+
+int bdrv_in_use(BlockDriverState *bs)
+{
+ return bs->in_use;
+}
+
+void bdrv_iostatus_enable(BlockDriverState *bs)
+{
+ bs->iostatus_enabled = true;
+ bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+}
+
+/* The I/O status is only enabled if the drive explicitly
+ * enables it _and_ the VM is configured to stop on errors */
+bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
+{
+ return (bs->iostatus_enabled &&
+ (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
+ bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
+ bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
+}
+
+void bdrv_iostatus_disable(BlockDriverState *bs)
+{
+ bs->iostatus_enabled = false;
+}
+
+void bdrv_iostatus_reset(BlockDriverState *bs)
+{
+ if (bdrv_iostatus_is_enabled(bs)) {
+ bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+ if (bs->job) {
+ block_job_iostatus_reset(bs->job);
+ }
+ }
+}
+
+void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
+{
+ assert(bdrv_iostatus_is_enabled(bs));
+ if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+ bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
+ BLOCK_DEVICE_IO_STATUS_FAILED;
+ }
+}
+
+void
+bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
+ enum BlockAcctType type)
+{
+ assert(type < BDRV_MAX_IOTYPE);
+
+ cookie->bytes = bytes;
+ cookie->start_time_ns = get_clock();
+ cookie->type = type;
+}
+
+void
+bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
+{
+ assert(cookie->type < BDRV_MAX_IOTYPE);
+
+ bs->nr_bytes[cookie->type] += cookie->bytes;
+ bs->nr_ops[cookie->type]++;
+ bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
+}
+
+void bdrv_img_create(const char *filename, const char *fmt,
+ const char *base_filename, const char *base_fmt,
+ char *options, uint64_t img_size, int flags,
+ Error **errp, bool quiet)
+{
+ QEMUOptionParameter *param = NULL, *create_options = NULL;
+ QEMUOptionParameter *backing_fmt, *backing_file, *size;
+ BlockDriverState *bs = NULL;
+ BlockDriver *drv, *proto_drv;
+ BlockDriver *backing_drv = NULL;
+ int ret = 0;
+
+ /* Find driver and parse its options */
+ drv = bdrv_find_format(fmt);
+ if (!drv) {
+ error_setg(errp, "Unknown file format '%s'", fmt);
+ return;
+ }
+
+ proto_drv = bdrv_find_protocol(filename, true);
+ if (!proto_drv) {
+ error_setg(errp, "Unknown protocol '%s'", filename);
+ return;
+ }
+
+ create_options = append_option_parameters(create_options,
+ drv->create_options);
+ create_options = append_option_parameters(create_options,
+ proto_drv->create_options);
+
+ /* Create parameter list with default values */
+ param = parse_option_parameters("", create_options, param);
+
+ set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
+
+ /* Parse -o options */
+ if (options) {
+ param = parse_option_parameters(options, create_options, param);
+ if (param == NULL) {
+ error_setg(errp, "Invalid options for file format '%s'.", fmt);
+ goto out;
+ }
+ }
+
+ if (base_filename) {
+ if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
+ base_filename)) {
+ error_setg(errp, "Backing file not supported for file format '%s'",
+ fmt);
+ goto out;
+ }
+ }
+
+ if (base_fmt) {
+ if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
+ error_setg(errp, "Backing file format not supported for file "
+ "format '%s'", fmt);
+ goto out;
+ }
+ }
+
+ backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
+ if (backing_file && backing_file->value.s) {
+ if (!strcmp(filename, backing_file->value.s)) {
+ error_setg(errp, "Error: Trying to create an image with the "
+ "same filename as the backing file");
+ goto out;
+ }
+ }
+
+ backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
+ if (backing_fmt && backing_fmt->value.s) {
+ backing_drv = bdrv_find_format(backing_fmt->value.s);
+ if (!backing_drv) {
+ error_setg(errp, "Unknown backing file format '%s'",
+ backing_fmt->value.s);
+ goto out;
+ }
+ }
+
+ // The size for the image must always be specified, with one exception:
+ // If we are using a backing file, we can obtain the size from there
+ size = get_option_parameter(param, BLOCK_OPT_SIZE);
+ if (size && size->value.n == -1) {
+ if (backing_file && backing_file->value.s) {
+ uint64_t size;
+ char buf[32];
+ int back_flags;
+
+ /* backing files always opened read-only */
+ back_flags =
+ flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
+
+ bs = bdrv_new("");
+
+ ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
+ backing_drv);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Could not open '%s'",
+ backing_file->value.s);
+ goto out;
+ }
+ bdrv_get_geometry(bs, &size);
+ size *= 512;
+
+ snprintf(buf, sizeof(buf), "%" PRId64, size);
+ set_option_parameter(param, BLOCK_OPT_SIZE, buf);
+ } else {
+ error_setg(errp, "Image creation needs a size parameter");
+ goto out;
+ }
+ }
+
+ if (!quiet) {
+ printf("Formatting '%s', fmt=%s ", filename, fmt);
+ print_option_parameters(param);
+ puts("");
+ }
+ ret = bdrv_create(drv, filename, param);
+ if (ret < 0) {
+ if (ret == -ENOTSUP) {
+ error_setg(errp,"Formatting or formatting option not supported for "
+ "file format '%s'", fmt);
+ } else if (ret == -EFBIG) {
+ const char *cluster_size_hint = "";
+ if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
+ cluster_size_hint = " (try using a larger cluster size)";
+ }
+ error_setg(errp, "The image size is too large for file format '%s'%s",
+ fmt, cluster_size_hint);
+ } else {
+ error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
+ strerror(-ret));
+ }
+ }
+
+out:
+ free_option_parameters(create_options);
+ free_option_parameters(param);
+
+ if (bs) {
+ bdrv_delete(bs);
+ }
+}
+
+AioContext *bdrv_get_aio_context(BlockDriverState *bs)
+{
+ /* Currently BlockDriverState always uses the main loop AioContext */
+ return qemu_get_aio_context();
+}
+
+void bdrv_add_before_write_notifier(BlockDriverState *bs,
+ NotifierWithReturn *notifier)
+{
+ notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
+}
diff --git a/contrib/qemu/block/qcow.c b/contrib/qemu/block/qcow.c
new file mode 100644
index 000000000..5239bd68f
--- /dev/null
+++ b/contrib/qemu/block/qcow.c
@@ -0,0 +1,914 @@
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include <zlib.h>
+#include "qemu/aes.h"
+#include "migration/migration.h"
+
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1
+
+#define QCOW_OFLAG_COMPRESSED (1LL << 63)
+
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t mtime;
+ uint64_t size; /* in bytes */
+ uint8_t cluster_bits;
+ uint8_t l2_bits;
+ uint32_t crypt_method;
+ uint64_t l1_table_offset;
+} QCowHeader;
+
+#define L2_CACHE_SIZE 16
+
+typedef struct BDRVQcowState {
+ int cluster_bits;
+ int cluster_size;
+ int cluster_sectors;
+ int l2_bits;
+ int l2_size;
+ int l1_size;
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset;
+ uint64_t *l1_table;
+ uint64_t *l2_cache;
+ uint64_t l2_cache_offsets[L2_CACHE_SIZE];
+ uint32_t l2_cache_counts[L2_CACHE_SIZE];
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset;
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ CoMutex lock;
+ Error *migration_blocker;
+} BDRVQcowState;
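+
+/*
+ * Address translation sketch: a guest offset is split into an L1 index
+ * (offset >> (l2_bits + cluster_bits)), an L2 index
+ * ((offset >> cluster_bits) & (l2_size - 1)) and an offset within the
+ * cluster. With the defaults qcow_create() uses for images without a backing
+ * file (cluster_bits = 12, l2_bits = 9), each L2 table covers
+ * 512 * 4 KB = 2 MB of guest data.
+ */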
+
+static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
+
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const QCowHeader *cow_header = (const void *)buf;
+
+ if (buf_size >= sizeof(QCowHeader) &&
+ be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
+ be32_to_cpu(cow_header->version) == QCOW_VERSION)
+ return 100;
+ else
+ return 0;
+}
+
+static int qcow_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ int len, i, shift, ret;
+ QCowHeader header;
+
+ ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto fail;
+ }
+ be32_to_cpus(&header.magic);
+ be32_to_cpus(&header.version);
+ be64_to_cpus(&header.backing_file_offset);
+ be32_to_cpus(&header.backing_file_size);
+ be32_to_cpus(&header.mtime);
+ be64_to_cpus(&header.size);
+ be32_to_cpus(&header.crypt_method);
+ be64_to_cpus(&header.l1_table_offset);
+
+ if (header.magic != QCOW_MAGIC) {
+ ret = -EMEDIUMTYPE;
+ goto fail;
+ }
+ if (header.version != QCOW_VERSION) {
+ char version[64];
+ snprintf(version, sizeof(version), "QCOW version %d", header.version);
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "qcow", version);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ if (header.size <= 1 || header.cluster_bits < 9) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.crypt_method > QCOW_CRYPT_AES) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->crypt_method_header = header.crypt_method;
+ if (s->crypt_method_header) {
+ bs->encrypted = 1;
+ }
+ s->cluster_bits = header.cluster_bits;
+ s->cluster_size = 1 << s->cluster_bits;
+ s->cluster_sectors = 1 << (s->cluster_bits - 9);
+ s->l2_bits = header.l2_bits;
+ s->l2_size = 1 << s->l2_bits;
+ bs->total_sectors = header.size / 512;
+ s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+
+ /* read the level 1 table */
+ shift = s->cluster_bits + s->l2_bits;
+ s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
+
+ s->l1_table_offset = header.l1_table_offset;
+ s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+
+ ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+ /* alloc L2 cache */
+ s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+ s->cluster_cache = g_malloc(s->cluster_size);
+ s->cluster_data = g_malloc(s->cluster_size);
+ s->cluster_cache_offset = -1;
+
+ /* read the backing file name */
+ if (header.backing_file_offset != 0) {
+ len = header.backing_file_size;
+ if (len > 1023) {
+ len = 1023;
+ }
+ ret = bdrv_pread(bs->file, header.backing_file_offset,
+ bs->backing_file, len);
+ if (ret < 0) {
+ goto fail;
+ }
+ bs->backing_file[len] = '\0';
+ }
+
+ /* Disable migration when qcow images are used */
+ error_set(&s->migration_blocker,
+ QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
+ "qcow", bs->device_name, "live migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ qemu_co_mutex_init(&s->lock);
+ return 0;
+
+ fail:
+ g_free(s->l1_table);
+ g_free(s->l2_cache);
+ g_free(s->cluster_cache);
+ g_free(s->cluster_data);
+ return ret;
+}
+
+
+/* We have nothing to do for QCOW reopen, stubs just return
+ * success */
+static int qcow_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint8_t keybuf[16];
+ int len, i;
+
+ memset(keybuf, 0, 16);
+ len = strlen(key);
+ if (len > 16)
+ len = 16;
+ /* XXX: we could compress the chars to 7 bits to increase
+ entropy */
+ for(i = 0;i < len;i++) {
+ keybuf[i] = key[i];
+ }
+ s->crypt_method = s->crypt_method_header;
+
+ if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+ return -1;
+ if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ return -1;
+ return 0;
+}
+
+/* The crypt function is compatible with the linux cryptoloop
+ algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ supported */
+static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key)
+{
+ union {
+ uint64_t ll[2];
+ uint8_t b[16];
+ } ivec;
+ int i;
+
+ for(i = 0; i < nb_sectors; i++) {
+ ivec.ll[0] = cpu_to_le64(sector_num);
+ ivec.ll[1] = 0;
+ AES_cbc_encrypt(in_buf, out_buf, 512, key,
+ ivec.b, enc);
+ sector_num++;
+ in_buf += 512;
+ out_buf += 512;
+ }
+}
+
+/* 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * return 0 if not allocated.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+ uint64_t offset, int allocate,
+ int compressed_size,
+ int n_start, int n_end)
+{
+ BDRVQcowState *s = bs->opaque;
+ int min_index, i, j, l1_index, l2_index;
+ uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+ uint32_t min_count;
+ int new_l2_table;
+
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ l2_offset = s->l1_table[l1_index];
+ new_l2_table = 0;
+ if (!l2_offset) {
+ if (!allocate)
+ return 0;
+ /* allocate a new l2 entry */
+ l2_offset = bdrv_getlength(bs->file);
+ /* round to cluster size */
+ l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
+ /* update the L1 entry */
+ s->l1_table[l1_index] = l2_offset;
+ tmp = cpu_to_be64(l2_offset);
+ if (bdrv_pwrite_sync(bs->file,
+ s->l1_table_offset + l1_index * sizeof(tmp),
+ &tmp, sizeof(tmp)) < 0)
+ return 0;
+ new_l2_table = 1;
+ }
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (l2_offset == s->l2_cache_offsets[i]) {
+ /* increment the hit count */
+ if (++s->l2_cache_counts[i] == 0xffffffff) {
+ for(j = 0; j < L2_CACHE_SIZE; j++) {
+ s->l2_cache_counts[j] >>= 1;
+ }
+ }
+ l2_table = s->l2_cache + (i << s->l2_bits);
+ goto found;
+ }
+ }
+ /* not found: load a new entry in the least used one */
+ min_index = 0;
+ min_count = 0xffffffff;
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (s->l2_cache_counts[i] < min_count) {
+ min_count = s->l2_cache_counts[i];
+ min_index = i;
+ }
+ }
+ l2_table = s->l2_cache + (min_index << s->l2_bits);
+ if (new_l2_table) {
+ memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+ if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
+ s->l2_size * sizeof(uint64_t)) < 0)
+ return 0;
+ } else {
+ if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return 0;
+ }
+ s->l2_cache_offsets[min_index] = l2_offset;
+ s->l2_cache_counts[min_index] = 1;
+ found:
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ if (!cluster_offset ||
+ ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
+ if (!allocate)
+ return 0;
+ /* allocate a new cluster */
+ if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+ (n_end - n_start) < s->cluster_sectors) {
+ /* if the cluster is already compressed, we must
+ decompress it in the case it is not completely
+ overwritten */
+ if (decompress_cluster(bs, cluster_offset) < 0)
+ return 0;
+ cluster_offset = bdrv_getlength(bs->file);
+ cluster_offset = (cluster_offset + s->cluster_size - 1) &
+ ~(s->cluster_size - 1);
+ /* write the cluster content */
+ if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) !=
+ s->cluster_size)
+ return -1;
+ } else {
+ cluster_offset = bdrv_getlength(bs->file);
+ if (allocate == 1) {
+ /* round to cluster size */
+ cluster_offset = (cluster_offset + s->cluster_size - 1) &
+ ~(s->cluster_size - 1);
+ bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
+ /* if encrypted, we must initialize the cluster
+ content which won't be written */
+ if (s->crypt_method &&
+ (n_end - n_start) < s->cluster_sectors) {
+ uint64_t start_sect;
+ start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
+ memset(s->cluster_data + 512, 0x00, 512);
+ for(i = 0; i < s->cluster_sectors; i++) {
+ if (i < n_start || i >= n_end) {
+ encrypt_sectors(s, start_sect + i,
+ s->cluster_data,
+ s->cluster_data + 512, 1, 1,
+ &s->aes_encrypt_key);
+ if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
+ s->cluster_data, 512) != 512)
+ return -1;
+ }
+ }
+ }
+ } else if (allocate == 2) {
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ (uint64_t)compressed_size << (63 - s->cluster_bits);
+ }
+ }
+ /* update L2 table */
+ tmp = cpu_to_be64(cluster_offset);
+ l2_table[l2_index] = tmp;
+ if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
+ &tmp, sizeof(tmp)) < 0)
+ return 0;
+ }
+ return cluster_offset;
+}
+
+static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster, n;
+ uint64_t cluster_offset;
+
+ qemu_co_mutex_lock(&s->lock);
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+ qemu_co_mutex_unlock(&s->lock);
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors)
+ n = nb_sectors;
+ *pnum = n;
+ return (cluster_offset != 0);
+}
+
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+ const uint8_t *buf, int buf_size)
+{
+ z_stream strm1, *strm = &strm1;
+ int ret, out_len;
+
+ memset(strm, 0, sizeof(*strm));
+
+ strm->next_in = (uint8_t *)buf;
+ strm->avail_in = buf_size;
+ strm->next_out = out_buf;
+ strm->avail_out = out_buf_size;
+
+ ret = inflateInit2(strm, -12);
+ if (ret != Z_OK)
+ return -1;
+ ret = inflate(strm, Z_FINISH);
+ out_len = strm->next_out - out_buf;
+ if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+ out_len != out_buf_size) {
+ inflateEnd(strm);
+ return -1;
+ }
+ inflateEnd(strm);
+ return 0;
+}
+
+static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, csize;
+ uint64_t coffset;
+
+ coffset = cluster_offset & s->cluster_offset_mask;
+ if (s->cluster_cache_offset != coffset) {
+ csize = cluster_offset >> (63 - s->cluster_bits);
+ csize &= (s->cluster_size - 1);
+ ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
+ if (ret != csize)
+ return -1;
+ if (decompress_buffer(s->cluster_cache, s->cluster_size,
+ s->cluster_data, csize) < 0) {
+ return -1;
+ }
+ s->cluster_cache_offset = coffset;
+ }
+ return 0;
+}
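+
+/*
+ * Compressed cluster descriptors keep the compressed byte size in the bits
+ * above bit (63 - cluster_bits): get_cluster_offset() encodes it as
+ * QCOW_OFLAG_COMPRESSED | compressed_size << (63 - cluster_bits), and
+ * decompress_cluster() recovers csize by shifting right by the same amount
+ * and masking with (cluster_size - 1).
+ */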
+
+static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ int ret = 0, n;
+ uint64_t cluster_offset;
+ struct iovec hd_iov;
+ QEMUIOVector hd_qiov;
+ uint8_t *buf;
+ void *orig_buf;
+
+ if (qiov->niov > 1) {
+ buf = orig_buf = qemu_blockalign(bs, qiov->size);
+ } else {
+ orig_buf = NULL;
+ buf = (uint8_t *)qiov->iov->iov_base;
+ }
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (nb_sectors != 0) {
+ /* prepare next request */
+ cluster_offset = get_cluster_offset(bs, sector_num << 9,
+ 0, 0, 0, 0);
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+
+ if (!cluster_offset) {
+ if (bs->backing_hd) {
+ /* read from the base image */
+ hd_iov.iov_base = (void *)buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->backing_hd, sector_num,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ /* Note: in this case, no need to wait */
+ memset(buf, 0, 512 * n);
+ }
+ } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+ /* add AIO support for compressed blocks ? */
+ if (decompress_cluster(bs, cluster_offset) < 0) {
+ goto fail;
+ }
+ memcpy(buf,
+ s->cluster_cache + index_in_cluster * 512, 512 * n);
+ } else {
+ if ((cluster_offset & 511) != 0) {
+ goto fail;
+ }
+ hd_iov.iov_base = (void *)buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ break;
+ }
+ if (s->crypt_method) {
+ encrypt_sectors(s, sector_num, buf, buf,
+ n, 0,
+ &s->aes_decrypt_key);
+ }
+ }
+ ret = 0;
+
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+
+done:
+ qemu_co_mutex_unlock(&s->lock);
+
+ if (qiov->niov > 1) {
+ qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
+ qemu_vfree(orig_buf);
+ }
+
+ return ret;
+
+fail:
+ ret = -EIO;
+ goto done;
+}
+
+static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ uint64_t cluster_offset;
+ const uint8_t *src_buf;
+ int ret = 0, n;
+ uint8_t *cluster_data = NULL;
+ struct iovec hd_iov;
+ QEMUIOVector hd_qiov;
+ uint8_t *buf;
+ void *orig_buf;
+
+ s->cluster_cache_offset = -1; /* disable compressed cache */
+
+ if (qiov->niov > 1) {
+ buf = orig_buf = qemu_blockalign(bs, qiov->size);
+ qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
+ } else {
+ orig_buf = NULL;
+ buf = (uint8_t *)qiov->iov->iov_base;
+ }
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (nb_sectors != 0) {
+
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors) {
+ n = nb_sectors;
+ }
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
+ index_in_cluster,
+ index_in_cluster + n);
+ if (!cluster_offset || (cluster_offset & 511) != 0) {
+ ret = -EIO;
+ break;
+ }
+ if (s->crypt_method) {
+ if (!cluster_data) {
+ cluster_data = g_malloc0(s->cluster_size);
+ }
+ encrypt_sectors(s, sector_num, cluster_data, buf,
+ n, 1, &s->aes_encrypt_key);
+ src_buf = cluster_data;
+ } else {
+ src_buf = buf;
+ }
+
+ hd_iov.iov_base = (void *)src_buf;
+ hd_iov.iov_len = n * 512;
+ qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_writev(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ n, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ break;
+ }
+ ret = 0;
+
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+ qemu_co_mutex_unlock(&s->lock);
+
+ if (qiov->niov > 1) {
+ qemu_vfree(orig_buf);
+ }
+ g_free(cluster_data);
+
+ return ret;
+}
+
+static void qcow_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ g_free(s->l1_table);
+ g_free(s->l2_cache);
+ g_free(s->cluster_cache);
+ g_free(s->cluster_data);
+
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+}
+
+static int qcow_create(const char *filename, QEMUOptionParameter *options)
+{
+ int header_size, backing_filename_len, l1_size, shift, i;
+ QCowHeader header;
+ uint8_t *tmp;
+ int64_t total_size = 0;
+ const char *backing_file = NULL;
+ int flags = 0;
+ int ret;
+ BlockDriverState *qcow_bs;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / 512;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
+ flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
+ }
+ options++;
+ }
+
+ ret = bdrv_create_file(filename, options);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_truncate(qcow_bs, 0);
+ if (ret < 0) {
+ goto exit;
+ }
+
+ memset(&header, 0, sizeof(header));
+ header.magic = cpu_to_be32(QCOW_MAGIC);
+ header.version = cpu_to_be32(QCOW_VERSION);
+ header.size = cpu_to_be64(total_size * 512);
+ header_size = sizeof(header);
+ backing_filename_len = 0;
+ if (backing_file) {
+ if (strcmp(backing_file, "fat:")) {
+ header.backing_file_offset = cpu_to_be64(header_size);
+ backing_filename_len = strlen(backing_file);
+ header.backing_file_size = cpu_to_be32(backing_filename_len);
+ header_size += backing_filename_len;
+ } else {
+ /* special backing file for vvfat */
+ backing_file = NULL;
+ }
+ header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+ unmodified sectors */
+ header.l2_bits = 12; /* 32 KB L2 tables */
+ } else {
+ header.cluster_bits = 12; /* 4 KB clusters */
+ header.l2_bits = 9; /* 4 KB L2 tables */
+ }
+ header_size = (header_size + 7) & ~7;
+ shift = header.cluster_bits + header.l2_bits;
+ l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
+
+ header.l1_table_offset = cpu_to_be64(header_size);
+ if (flags & BLOCK_FLAG_ENCRYPT) {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+ } else {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+ }
+
+ /* write all the data */
+ ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header));
+ if (ret != sizeof(header)) {
+ goto exit;
+ }
+
+ if (backing_file) {
+ ret = bdrv_pwrite(qcow_bs, sizeof(header),
+ backing_file, backing_filename_len);
+ if (ret != backing_filename_len) {
+ goto exit;
+ }
+ }
+
+ tmp = g_malloc0(BDRV_SECTOR_SIZE);
+ for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
+ BDRV_SECTOR_SIZE); i++) {
+ ret = bdrv_pwrite(qcow_bs, header_size +
+ BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
+ if (ret != BDRV_SECTOR_SIZE) {
+ g_free(tmp);
+ goto exit;
+ }
+ }
+
+ g_free(tmp);
+ ret = 0;
+exit:
+ bdrv_delete(qcow_bs);
+ return ret;
+}
+
+static int qcow_make_empty(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+ int ret;
+
+ memset(s->l1_table, 0, l1_length);
+ if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
+ l1_length) < 0)
+ return -1;
+ ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
+ if (ret < 0)
+ return ret;
+
+ memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+ memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+ memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+ return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+ tables to avoid losing bytes in alignment */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ z_stream strm;
+ int ret, out_len;
+ uint8_t *out_buf;
+ uint64_t cluster_offset;
+
+ if (nb_sectors != s->cluster_sectors) {
+ ret = -EINVAL;
+
+ /* Zero-pad last write if image size is not cluster aligned */
+ if (sector_num + nb_sectors == bs->total_sectors &&
+ nb_sectors < s->cluster_sectors) {
+ uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+ memset(pad_buf, 0, s->cluster_size);
+ memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+ ret = qcow_write_compressed(bs, sector_num,
+ pad_buf, s->cluster_sectors);
+ qemu_vfree(pad_buf);
+ }
+ return ret;
+ }
+
+ out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+ /* best compression, small window, no zlib header */
+ memset(&strm, 0, sizeof(strm));
+ ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -12,
+ 9, Z_DEFAULT_STRATEGY);
+ if (ret != 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ strm.avail_in = s->cluster_size;
+ strm.next_in = (uint8_t *)buf;
+ strm.avail_out = s->cluster_size;
+ strm.next_out = out_buf;
+
+ ret = deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END && ret != Z_OK) {
+ deflateEnd(&strm);
+ ret = -EINVAL;
+ goto fail;
+ }
+ out_len = strm.next_out - out_buf;
+
+ deflateEnd(&strm);
+
+ if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+ /* could not compress: write normal cluster */
+ ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+ out_len, 0, 0);
+ if (cluster_offset == 0) {
+ ret = -EIO;
+ goto fail;
+ }
+
+ cluster_offset &= s->cluster_offset_mask;
+ ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ g_free(out_buf);
+ return ret;
+}
+
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQcowState *s = bs->opaque;
+ bdi->cluster_size = s->cluster_size;
+ return 0;
+}
+
+
+static QEMUOptionParameter qcow_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_ENCRYPT,
+ .type = OPT_FLAG,
+ .help = "Encrypt the image"
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_qcow = {
+ .format_name = "qcow",
+ .instance_size = sizeof(BDRVQcowState),
+ .bdrv_probe = qcow_probe,
+ .bdrv_open = qcow_open,
+ .bdrv_close = qcow_close,
+ .bdrv_reopen_prepare = qcow_reopen_prepare,
+ .bdrv_create = qcow_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+
+ .bdrv_co_readv = qcow_co_readv,
+ .bdrv_co_writev = qcow_co_writev,
+ .bdrv_co_is_allocated = qcow_co_is_allocated,
+
+ .bdrv_set_key = qcow_set_key,
+ .bdrv_make_empty = qcow_make_empty,
+ .bdrv_write_compressed = qcow_write_compressed,
+ .bdrv_get_info = qcow_get_info,
+
+ .create_options = qcow_create_options,
+};
+
+static void bdrv_qcow_init(void)
+{
+ bdrv_register(&bdrv_qcow);
+}
+
+block_init(bdrv_qcow_init);
diff --git a/contrib/qemu/block/qcow2-cache.c b/contrib/qemu/block/qcow2-cache.c
new file mode 100644
index 000000000..2f3114ecc
--- /dev/null
+++ b/contrib/qemu/block/qcow2-cache.c
@@ -0,0 +1,323 @@
+/*
+ * L2/refcount table cache for the QCOW2 format
+ *
+ * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/block_int.h"
+#include "qemu-common.h"
+#include "qcow2.h"
+#include "trace.h"
+
+typedef struct Qcow2CachedTable {
+ void* table;
+ int64_t offset;
+ bool dirty;
+ int cache_hits;
+ int ref;
+} Qcow2CachedTable;
+
+struct Qcow2Cache {
+ Qcow2CachedTable* entries;
+ struct Qcow2Cache* depends;
+ int size;
+ bool depends_on_flush;
+};
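+
+/*
+ * Usage sketch (illustrative): callers pair every qcow2_cache_get() with a
+ * qcow2_cache_put() and mark modified tables dirty before releasing them:
+ *
+ *     uint64_t *l2_table;
+ *     qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void **) &l2_table);
+ *     l2_table[l2_index] = cpu_to_be64(cluster_offset);
+ *     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ *     qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+ */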
+
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2Cache *c;
+ int i;
+
+ c = g_malloc0(sizeof(*c));
+ c->size = num_tables;
+ c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
+
+ for (i = 0; i < c->size; i++) {
+ c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+ }
+
+ return c;
+}
+
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ assert(c->entries[i].ref == 0);
+ qemu_vfree(c->entries[i].table);
+ }
+
+ g_free(c->entries);
+ g_free(c);
+
+ return 0;
+}
+
+static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c)
+{
+ int ret;
+
+ ret = qcow2_cache_flush(bs, c->depends);
+ if (ret < 0) {
+ return ret;
+ }
+
+ c->depends = NULL;
+ c->depends_on_flush = false;
+
+ return 0;
+}
+
+static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret = 0;
+
+ if (!c->entries[i].dirty || !c->entries[i].offset) {
+ return 0;
+ }
+
+ trace_qcow2_cache_entry_flush(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+
+ if (c->depends) {
+ ret = qcow2_cache_flush_dependency(bs, c);
+ } else if (c->depends_on_flush) {
+ ret = bdrv_flush(bs->file);
+ if (ret >= 0) {
+ c->depends_on_flush = false;
+ }
+ }
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (c == s->refcount_block_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
+ } else if (c == s->l2_table_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
+ }
+
+ ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
+ s->cluster_size);
+ if (ret < 0) {
+ return ret;
+ }
+
+ c->entries[i].dirty = false;
+
+ return 0;
+}
+
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+{
+ BDRVQcowState *s = bs->opaque;
+ int result = 0;
+ int ret;
+ int i;
+
+ trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache);
+
+ for (i = 0; i < c->size; i++) {
+ ret = qcow2_cache_entry_flush(bs, c, i);
+ if (ret < 0 && result != -ENOSPC) {
+ result = ret;
+ }
+ }
+
+ if (result == 0) {
+ ret = bdrv_flush(bs->file);
+ if (ret < 0) {
+ result = ret;
+ }
+ }
+
+ return result;
+}
+
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+ Qcow2Cache *dependency)
+{
+ int ret;
+
+ if (dependency->depends) {
+ ret = qcow2_cache_flush_dependency(bs, dependency);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ if (c->depends && (c->depends != dependency)) {
+ ret = qcow2_cache_flush_dependency(bs, c);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ c->depends = dependency;
+ return 0;
+}
+
+void qcow2_cache_depends_on_flush(Qcow2Cache *c)
+{
+ c->depends_on_flush = true;
+}
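+
+/*
+ * Dependencies order metadata writeback: qcow2_cache_entry_flush() first
+ * flushes the cache recorded in c->depends (or bdrv_flush()es the image file
+ * when depends_on_flush is set) before writing its own entries, so callers
+ * can ensure one kind of table reaches the disk before another.
+ */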
+
+static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
+{
+ int i;
+ int min_count = INT_MAX;
+ int min_index = -1;
+
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].ref) {
+ continue;
+ }
+
+ if (c->entries[i].cache_hits < min_count) {
+ min_index = i;
+ min_count = c->entries[i].cache_hits;
+ }
+
+ /* Give newer hits priority */
+ /* TODO Check how to optimize the replacement strategy */
+ c->entries[i].cache_hits /= 2;
+ }
+
+ if (min_index == -1) {
+ /* This can't happen in current synchronous code, but leave the check
+ * here as a reminder for whoever starts using AIO with the cache */
+ abort();
+ }
+ return min_index;
+}
+
+static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
+ uint64_t offset, void **table, bool read_from_disk)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+ int ret;
+
+ trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
+ offset, read_from_disk);
+
+ /* Check if the table is already cached */
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].offset == offset) {
+ goto found;
+ }
+ }
+
+ /* If not, write a table back and replace it */
+ i = qcow2_cache_find_entry_to_replace(c);
+ trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+ if (i < 0) {
+ return i;
+ }
+
+ ret = qcow2_cache_entry_flush(bs, c, i);
+ if (ret < 0) {
+ return ret;
+ }
+
+ trace_qcow2_cache_get_read(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+ c->entries[i].offset = 0;
+ if (read_from_disk) {
+ if (c == s->l2_table_cache) {
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
+ }
+
+ ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* Give the table some hits to start with so that it won't be replaced
+ * immediately. The number 32 is completely arbitrary. */
+ c->entries[i].cache_hits = 32;
+ c->entries[i].offset = offset;
+
+ /* And return the right table */
+found:
+ c->entries[i].cache_hits++;
+ c->entries[i].ref++;
+ *table = c->entries[i].table;
+
+ trace_qcow2_cache_get_done(qemu_coroutine_self(),
+ c == s->l2_table_cache, i);
+
+ return 0;
+}
+
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table)
+{
+ return qcow2_cache_do_get(bs, c, offset, table, true);
+}
+
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table)
+{
+ return qcow2_cache_do_get(bs, c, offset, table, false);
+}
+
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].table == *table) {
+ goto found;
+ }
+ }
+ return -ENOENT;
+
+found:
+ c->entries[i].ref--;
+ *table = NULL;
+
+ assert(c->entries[i].ref >= 0);
+ return 0;
+}
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
+{
+ int i;
+
+ for (i = 0; i < c->size; i++) {
+ if (c->entries[i].table == table) {
+ goto found;
+ }
+ }
+ abort();
+
+found:
+ c->entries[i].dirty = true;
+}
diff --git a/contrib/qemu/block/qcow2-cluster.c b/contrib/qemu/block/qcow2-cluster.c
new file mode 100644
index 000000000..cca76d4fc
--- /dev/null
+++ b/contrib/qemu/block/qcow2-cluster.c
@@ -0,0 +1,1478 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <zlib.h>
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/qcow2.h"
+#include "trace.h"
+
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+ bool exact_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int new_l1_size2, ret, i;
+ uint64_t *new_l1_table;
+ int64_t new_l1_table_offset, new_l1_size;
+ uint8_t data[12];
+
+ if (min_size <= s->l1_size)
+ return 0;
+
+ if (exact_size) {
+ new_l1_size = min_size;
+ } else {
+ /* Bump size up to reduce the number of times we have to grow */
+ new_l1_size = s->l1_size;
+ if (new_l1_size == 0) {
+ new_l1_size = 1;
+ }
+ while (min_size > new_l1_size) {
+ new_l1_size = (new_l1_size * 3 + 1) / 2;
+ }
+ }
+
+ if (new_l1_size > INT_MAX) {
+ return -EFBIG;
+ }
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
+ s->l1_size, new_l1_size);
+#endif
+
+ new_l1_size2 = sizeof(uint64_t) * new_l1_size;
+ new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
+ memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
+
+ /* write new table (align to cluster) */
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
+ new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
+ if (new_l1_table_offset < 0) {
+ g_free(new_l1_table);
+ return new_l1_table_offset;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
+ for(i = 0; i < s->l1_size; i++)
+ new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
+ ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2);
+ if (ret < 0)
+ goto fail;
+ for(i = 0; i < s->l1_size; i++)
+ new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
+
+ /* set new table */
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
+ cpu_to_be32w((uint32_t*)data, new_l1_size);
+ cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data));
+ if (ret < 0) {
+ goto fail;
+ }
+ g_free(s->l1_table);
+ qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
+ s->l1_table_offset = new_l1_table_offset;
+ s->l1_table = new_l1_table;
+ s->l1_size = new_l1_size;
+ return 0;
+ fail:
+ g_free(new_l1_table);
+ qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
+ QCOW2_DISCARD_OTHER);
+ return ret;
+}
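+
+/*
+ * Note the update order above: the new L1 table is written and synced at its
+ * new location, the header's l1_size and l1_table_offset fields are then
+ * updated with bdrv_pwrite_sync(), and only afterwards are the old table's
+ * clusters freed, so an interruption in between should at worst leak the
+ * newly allocated clusters rather than corrupt the L1 table.
+ */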
+
+/*
+ * l2_load
+ *
+ * Loads an L2 table into memory. If the table is in the cache, the cache
+ * is used; otherwise the L2 table is loaded from the image file.
+ *
+ * Returns a pointer to the L2 table on success, or NULL if the read from
+ * the image file failed.
+ */
+
+static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
+ uint64_t **l2_table)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
+
+ return ret;
+}
+
+/*
+ * Writes one sector of the L1 table to the disk (can't update single entries
+ * and we really don't want bdrv_pwrite to perform a read-modify-write)
+ */
+#define L1_ENTRIES_PER_SECTOR (512 / 8)
+static int write_l1_entry(BlockDriverState *bs, int l1_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t buf[L1_ENTRIES_PER_SECTOR];
+ int l1_start_index;
+ int i, ret;
+
+ l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
+ for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
+ buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+ ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
+ buf, sizeof(buf));
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * l2_allocate
+ *
+ * Allocate a new l2 entry in the file. If the l1 entry at l1_index already
+ * points to an L2 table (i.e. we are doing a copy on write for the L2
+ * table), copy the contents of the old L2 table into the newly allocated one.
+ * Otherwise the new table is initialized with zeros.
+ *
+ */
+
+static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t old_l2_offset;
+ uint64_t *l2_table;
+ int64_t l2_offset;
+ int ret;
+
+ old_l2_offset = s->l1_table[l1_index];
+
+ trace_qcow2_l2_allocate(bs, l1_index);
+
+ /* allocate a new l2 entry */
+
+ l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
+ if (l2_offset < 0) {
+ return l2_offset;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* allocate a new entry in the l2 cache */
+
+ trace_qcow2_l2_allocate_get_empty(bs, l1_index);
+ ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ l2_table = *table;
+
+ if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
+ /* if there was no old l2 table, clear the new table */
+ memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+ } else {
+ uint64_t* old_table;
+
+ /* if there was an old l2 table, read it from the disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
+ ret = qcow2_cache_get(bs, s->l2_table_cache,
+ old_l2_offset & L1E_OFFSET_MASK,
+ (void**) &old_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ memcpy(l2_table, old_table, s->cluster_size);
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* write the l2 table to the file */
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
+
+ trace_qcow2_l2_allocate_write_l2(bs, l1_index);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* update the L1 entry */
+ trace_qcow2_l2_allocate_write_l1(bs, l1_index);
+ s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
+ ret = write_l1_entry(bs, l1_index);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ *table = l2_table;
+ trace_qcow2_l2_allocate_done(bs, l1_index, 0);
+ return 0;
+
+fail:
+ trace_qcow2_l2_allocate_done(bs, l1_index, ret);
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
+ s->l1_table[l1_index] = old_l2_offset;
+ return ret;
+}
+
+/*
+ * Checks how many clusters in a given L2 table are contiguous in the image
+ * file. As soon as one of the flags in the bitmask stop_flags changes compared
+ * to the first cluster, the search is stopped and the cluster is not counted
+ * as contiguous. (This allows it, for example, to stop at the first compressed
+ * cluster, which may require different handling.)
+ */
+static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
+ uint64_t *l2_table, uint64_t start, uint64_t stop_flags)
+{
+ int i;
+ uint64_t mask = stop_flags | L2E_OFFSET_MASK;
+ uint64_t offset = be64_to_cpu(l2_table[0]) & mask;
+
+ if (!offset)
+ return 0;
+
+ for (i = start; i < start + nb_clusters; i++) {
+ uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
+ if (offset + (uint64_t) i * cluster_size != l2_entry) {
+ break;
+ }
+ }
+
+ return (i - start);
+}
+
+static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
+{
+ int i;
+
+ for (i = 0; i < nb_clusters; i++) {
+ int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
+
+ if (type != QCOW2_CLUSTER_UNALLOCATED) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+/* The crypt function is compatible with the linux cryptoloop
+ algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ supported */
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key)
+{
+ union {
+ uint64_t ll[2];
+ uint8_t b[16];
+ } ivec;
+ int i;
+
+ for(i = 0; i < nb_sectors; i++) {
+ ivec.ll[0] = cpu_to_le64(sector_num);
+ ivec.ll[1] = 0;
+ AES_cbc_encrypt(in_buf, out_buf, 512, key,
+ ivec.b, enc);
+ sector_num++;
+ in_buf += 512;
+ out_buf += 512;
+ }
+}
+
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+ uint64_t start_sect,
+ uint64_t cluster_offset,
+ int n_start, int n_end)
+{
+ BDRVQcowState *s = bs->opaque;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ int n, ret;
+
+ /*
+ * If this is the last cluster and it is only partially used, we must only
+ * copy until the end of the image, or bdrv_check_request will fail for the
+ * bdrv_read/write calls below.
+ */
+ if (start_sect + n_end > bs->total_sectors) {
+ n_end = bs->total_sectors - start_sect;
+ }
+
+ n = n_end - n_start;
+ if (n <= 0) {
+ return 0;
+ }
+
+ iov.iov_len = n * BDRV_SECTOR_SIZE;
+ iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
+
+ /* Call .bdrv_co_readv() directly instead of using the public block-layer
+ * interface. This avoids double I/O throttling and request tracking,
+ * which can lead to deadlock when block layer copy-on-read is enabled.
+ */
+ ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (s->crypt_method) {
+ qcow2_encrypt_sectors(s, start_sect + n_start,
+ iov.iov_base, iov.iov_base, n, 1,
+ &s->aes_encrypt_key);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
+ ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = 0;
+out:
+ qemu_vfree(iov.iov_base);
+ return ret;
+}
+
+
+/*
+ * get_cluster_offset
+ *
+ * For a given offset of the disk image, find the cluster offset in
+ * qcow2 file. The offset is stored in *cluster_offset.
+ *
+ * on entry, *num is the number of contiguous sectors we'd like to
+ * access following offset.
+ *
+ * on exit, *num is the number of contiguous sectors we can read.
+ *
+ * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
+ * cases.
+ */
+int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int *num, uint64_t *cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int l2_index;
+ uint64_t l1_index, l2_offset, *l2_table;
+ int l1_bits, c;
+ unsigned int index_in_cluster, nb_clusters;
+ uint64_t nb_available, nb_needed;
+ int ret;
+
+ index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
+ nb_needed = *num + index_in_cluster;
+
+ l1_bits = s->l2_bits + s->cluster_bits;
+
+ /* compute how many bytes there are between the offset and
+ * the end of the l1 entry
+ */
+
+ nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
+
+ /* compute the number of available sectors */
+
+ nb_available = (nb_available >> 9) + index_in_cluster;
+
+ if (nb_needed > nb_available) {
+ nb_needed = nb_available;
+ }
+
+ *cluster_offset = 0;
+
+ /* seek the l2 offset in the l1 table */
+
+ l1_index = offset >> l1_bits;
+ if (l1_index >= s->l1_size) {
+ ret = QCOW2_CLUSTER_UNALLOCATED;
+ goto out;
+ }
+
+ l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+ if (!l2_offset) {
+ ret = QCOW2_CLUSTER_UNALLOCATED;
+ goto out;
+ }
+
+ /* load the l2 table in memory */
+
+ ret = l2_load(bs, l2_offset, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ *cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ nb_clusters = size_to_clusters(s, nb_needed << 9);
+
+ ret = qcow2_get_cluster_type(*cluster_offset);
+ switch (ret) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* Compressed clusters can only be processed one by one */
+ c = 1;
+ *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
+ break;
+ case QCOW2_CLUSTER_ZERO:
+ if (s->qcow_version < 3) {
+ return -EIO;
+ }
+ c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
+ *cluster_offset = 0;
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ /* how many empty clusters ? */
+ c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+ *cluster_offset = 0;
+ break;
+ case QCOW2_CLUSTER_NORMAL:
+ /* how many allocated clusters ? */
+ c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
+ *cluster_offset &= L2E_OFFSET_MASK;
+ break;
+ default:
+ abort();
+ }
+
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+
+ nb_available = (c * s->cluster_sectors);
+
+out:
+ if (nb_available > nb_needed)
+ nb_available = nb_needed;
+
+ *num = nb_available - index_in_cluster;
+
+ return ret;
+}
+
+/*
+ * get_cluster_table
+ *
+ * For a given disk offset, load (and allocate if needed)
+ * the L2 table.
+ *
+ * The L2 table (via *new_l2_table) and the cluster index within it
+ * (via *new_l2_index) are returned to the caller.
+ *
+ * Returns 0 on success, -errno in failure cases
+ */
+static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
+ uint64_t **new_l2_table,
+ int *new_l2_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int l2_index;
+ uint64_t l1_index, l2_offset;
+ uint64_t *l2_table = NULL;
+ int ret;
+
+ /* seek the l2 offset in the l1 table */
+
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ if (l1_index >= s->l1_size) {
+ ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ assert(l1_index < s->l1_size);
+ l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+
+ /* seek the l2 table of the given l2 offset */
+
+ if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
+ /* load the l2 table in memory */
+ ret = l2_load(bs, l2_offset, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ /* First allocate a new L2 table (and do COW if needed) */
+ ret = l2_allocate(bs, l1_index, &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Then decrease the refcount of the old table */
+ if (l2_offset) {
+ qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
+ }
+ }
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+
+ *new_l2_table = l2_table;
+ *new_l2_index = l2_index;
+
+ return 0;
+}
+
+/*
+ * alloc_compressed_cluster_offset
+ *
+ * For a given offset of the disk image, return the cluster offset in
+ * the qcow2 file.
+ *
+ * If the offset is not found, allocate a new compressed cluster.
+ *
+ * Returns the cluster offset if successful, or 0 otherwise.
+ *
+ */
+
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int compressed_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index, ret;
+ uint64_t *l2_table;
+ int64_t cluster_offset;
+ int nb_csectors;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return 0;
+ }
+
+ /* Compression can't overwrite anything. Fail if the cluster was already
+ * allocated. */
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ if (cluster_offset & L2E_OFFSET_MASK) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ return 0;
+ }
+
+ cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
+ if (cluster_offset < 0) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ return 0;
+ }
+
+ nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
+ (cluster_offset >> 9);
+
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ ((uint64_t)nb_csectors << s->csize_shift);
+
+ /* update L2 table */
+
+ /* compressed clusters never have the copied flag */
+
+ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ l2_table[l2_index] = cpu_to_be64(cluster_offset);
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return 0;
+ }
+
+ return cluster_offset;
+}
+
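+/*
+ * Layout of the COW regions handled by perform_cow(), as a sketch only:
+ *
+ *   |<-- m->cow_start -->|<------ guest write ------>|<-- m->cow_end -->|
+ *   start of first allocated cluster        end of last allocated cluster
+ *
+ * Both regions are filled from the old data (or the backing file) by
+ * copy_sectors() before the L2 table is pointed at the new clusters.
+ */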
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ if (r->nb_sectors == 0) {
+ return 0;
+ }
+
+ qemu_co_mutex_unlock(&s->lock);
+ ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
+ r->offset / BDRV_SECTOR_SIZE,
+ r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+ qemu_co_mutex_lock(&s->lock);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * Before we update the L2 table to actually point to the new cluster, we
+ * need to be sure that the refcounts have been increased and COW was
+ * handled.
+ */
+ qcow2_cache_depends_on_flush(s->l2_table_cache);
+
+ return 0;
+}
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, j = 0, l2_index, ret;
+ uint64_t *old_cluster, *l2_table;
+ uint64_t cluster_offset = m->alloc_offset;
+
+ trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+ assert(m->nb_clusters > 0);
+
+ old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+
+ /* copy content of unmodified sectors */
+ ret = perform_cow(bs, m, &m->cow_start);
+ if (ret < 0) {
+ goto err;
+ }
+
+ ret = perform_cow(bs, m, &m->cow_end);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /* Update L2 table. */
+ if (s->use_lazy_refcounts) {
+ qcow2_mark_dirty(bs);
+ }
+ if (qcow2_need_accurate_refcounts(s)) {
+ qcow2_cache_set_dependency(bs, s->l2_table_cache,
+ s->refcount_block_cache);
+ }
+
+ ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ goto err;
+ }
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+
+ for (i = 0; i < m->nb_clusters; i++) {
+ /* If two concurrent writes happen to the same unallocated cluster,
+ * each write allocates a separate cluster and writes data concurrently.
+ * The first one to complete updates the L2 table with a pointer to its
+ * cluster; the second one then has to do RMW (which is done above by
+ * copy_sectors()), update the L2 table with its own cluster pointer,
+ * and free the old cluster. This is what this loop does. */
+ if (l2_table[l2_index + i] != 0) {
+ old_cluster[j++] = l2_table[l2_index + i];
+ }
+
+ l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
+ (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
+ }
+
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ goto err;
+ }
+
+ /*
+ * If this was a COW, we need to decrease the refcount of the old cluster.
+ * Also flush bs->file to get the right order for L2 and refcount update.
+ *
+ * Don't discard clusters that reach a refcount of 0 (e.g. compressed
+ * clusters), the next write will reuse them anyway.
+ */
+ if (j != 0) {
+ for (i = 0; i < j; i++) {
+ qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
+ QCOW2_DISCARD_NEVER);
+ }
+ }
+
+ ret = 0;
+err:
+ g_free(old_cluster);
+ return ret;
+}
+
+/*
+ * Returns the number of contiguous clusters that can be used for an allocating
+ * write, but require COW to be performed (this includes yet unallocated space,
+ * which must be copied from the backing file).
+ */
+static int count_cow_clusters(BDRVQcowState *s, int nb_clusters,
+ uint64_t *l2_table, int l2_index)
+{
+ int i;
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
+ int cluster_type = qcow2_get_cluster_type(l2_entry);
+
+ switch(cluster_type) {
+ case QCOW2_CLUSTER_NORMAL:
+ if (l2_entry & QCOW_OFLAG_COPIED) {
+ goto out;
+ }
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ case QCOW2_CLUSTER_COMPRESSED:
+ case QCOW2_CLUSTER_ZERO:
+ break;
+ default:
+ abort();
+ }
+ }
+
+out:
+ assert(i <= nb_clusters);
+ return i;
+}
+
+/*
+ * Check if there already is an AIO write request in flight which allocates
+ * the same cluster. In this case we need to wait until the previous
+ * request has completed and updated the L2 table accordingly.
+ *
+ * Returns:
+ * 0 if there was no dependency. *cur_bytes indicates the number of
+ * bytes from guest_offset that can be read before the next
+ * dependency must be processed (or the request is complete)
+ *
+ * -EAGAIN if we had to wait for another request, previously gathered
+ * information on cluster allocation may be invalid now. The caller
+ * must start over anyway, so consider *cur_bytes undefined.
+ */
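+/*
+ * Illustration of the shortening logic (sketch only, byte granularity):
+ *
+ *      start                start + bytes
+ *        |---- new request ----|
+ *                 |------ in-flight allocation ------|
+ *                 ^ old_start
+ *
+ * The request is cut short at old_start. If start already lies inside the
+ * in-flight allocation, bytes becomes 0 and the caller either stops
+ * gathering clusters or waits on dependent_requests.
+ */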
+static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *cur_bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowL2Meta *old_alloc;
+ uint64_t bytes = *cur_bytes;
+
+ QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
+
+ uint64_t start = guest_offset;
+ uint64_t end = start + bytes;
+ uint64_t old_start = l2meta_cow_start(old_alloc);
+ uint64_t old_end = l2meta_cow_end(old_alloc);
+
+ if (end <= old_start || start >= old_end) {
+ /* No intersection */
+ } else {
+ if (start < old_start) {
+ /* Stop at the start of a running allocation */
+ bytes = old_start - start;
+ } else {
+ bytes = 0;
+ }
+
+ /* Stop if an l2meta already exists. After yielding, it wouldn't
+ * be valid any more, so we'd have to clean up the old L2Metas
+ * and deal with requests depending on them before starting to
+ * gather new ones. Not worth the trouble. */
+ if (bytes == 0 && *m) {
+ *cur_bytes = 0;
+ return 0;
+ }
+
+ if (bytes == 0) {
+ /* Wait for the dependency to complete. We need to recheck
+ * the free/allocated clusters when we continue. */
+ qemu_co_mutex_unlock(&s->lock);
+ qemu_co_queue_wait(&old_alloc->dependent_requests);
+ qemu_co_mutex_lock(&s->lock);
+ return -EAGAIN;
+ }
+ }
+ }
+
+ /* Make sure that existing clusters and new allocations are only used up to
+ * the next dependency if we shortened the request above */
+ *cur_bytes = bytes;
+
+ return 0;
+}
+
+/*
+ * Checks how many clusters that are already allocated and don't require a
+ * copy on write exist at the given guest_offset (up to *bytes). If
+ * *host_offset is not zero, only physically contiguous clusters beginning at
+ * this host offset are counted.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to the exact byte referenced by guest_offset
+ * and therefore isn't cluster aligned either.
+ *
+ * Returns:
+ * 0: if no allocated clusters are available at the given offset.
+ * *bytes is normally unchanged. It is set to 0 if the cluster
+ * is allocated and doesn't need COW, but doesn't have the right
+ * physical offset.
+ *
+ * 1: if allocated clusters that don't require a COW are available at
+ * the requested offset. *bytes may have decreased and describes
+ * the length of the area that can be written to.
+ *
+ * -errno: in error cases
+ */
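+/*
+ * Background note: QCOW_OFLAG_COPIED in an L2 entry indicates that the
+ * cluster's refcount is exactly 1, so it may be overwritten in place
+ * without a copy on write.
+ */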
+static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index;
+ uint64_t cluster_offset;
+ uint64_t *l2_table;
+ unsigned int nb_clusters;
+ unsigned int keep_clusters;
+ int ret, pret;
+
+ trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
+ *bytes);
+
+ assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
+ == offset_into_cluster(s, *host_offset));
+
+ /*
+ * Calculate the number of clusters to look for. We stop at L2 table
+ * boundaries to keep things simple.
+ */
+ nb_clusters =
+ size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+ l2_index = offset_to_l2_index(s, guest_offset);
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ /* Find L2 entry for the first involved cluster */
+ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+ /* Check how many clusters are already allocated and don't need COW */
+ if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
+ && (cluster_offset & QCOW_OFLAG_COPIED))
+ {
+ /* If a specific host_offset is required, check it */
+ bool offset_matches =
+ (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
+
+ if (*host_offset != 0 && !offset_matches) {
+ *bytes = 0;
+ ret = 0;
+ goto out;
+ }
+
+ /* We keep all QCOW_OFLAG_COPIED clusters */
+ keep_clusters =
+ count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0,
+ QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
+ assert(keep_clusters <= nb_clusters);
+
+ *bytes = MIN(*bytes,
+ keep_clusters * s->cluster_size
+ - offset_into_cluster(s, guest_offset));
+
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+
+ /* Cleanup */
+out:
+ pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (pret < 0) {
+ return pret;
+ }
+
+ /* Only return a host offset if we actually made progress. Otherwise we
+ * would make requirements for handle_alloc() that it can't fulfill */
+ if (ret) {
+ *host_offset = (cluster_offset & L2E_OFFSET_MASK)
+ + offset_into_cluster(s, guest_offset);
+ }
+
+ return ret;
+}
+
+/*
+ * Allocates new clusters for the given guest_offset.
+ *
+ * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
+ * contain the number of clusters that have been allocated and are contiguous
+ * in the image file.
+ *
+ * If *host_offset is non-zero, it specifies the offset in the image file at
+ * which the new clusters must start. *nb_clusters can be 0 on return in this
+ * case if the cluster at host_offset is already in use. If *host_offset is
+ * zero, the clusters can be allocated anywhere in the image file.
+ *
+ * *host_offset is updated to contain the offset into the image file at which
+ * the first allocated cluster starts.
+ *
+ * Return 0 on success and -errno in error cases. -EAGAIN means that the
+ * function has been waiting for another request and the allocation must be
+ * restarted, but the whole request should not be failed.
+ */
+static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, unsigned int *nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
+ *host_offset, *nb_clusters);
+
+ /* Allocate new clusters */
+ trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
+ if (*host_offset == 0) {
+ int64_t cluster_offset =
+ qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
+ if (cluster_offset < 0) {
+ return cluster_offset;
+ }
+ *host_offset = cluster_offset;
+ return 0;
+ } else {
+ int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
+ if (ret < 0) {
+ return ret;
+ }
+ *nb_clusters = ret;
+ return 0;
+ }
+}
+
+/*
+ * Allocates new clusters for an area that either is yet unallocated or needs a
+ * copy on write. If *host_offset is non-zero, clusters are only allocated if
+ * the new allocation can match the specified host offset.
+ *
+ * Note that guest_offset may not be cluster aligned. In this case, the
+ * returned *host_offset points to the exact byte referenced by guest_offset
+ * and therefore isn't cluster aligned either.
+ *
+ * Returns:
+ * 0: if no clusters could be allocated. *bytes is set to 0,
+ * *host_offset is left unchanged.
+ *
+ * 1: if new clusters were allocated. *bytes may be decreased if the
+ * new allocation doesn't cover all of the requested area.
+ * *host_offset is updated to contain the host offset of the first
+ * newly allocated cluster.
+ *
+ * -errno: in error cases
+ */
+static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
+ uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index;
+ uint64_t *l2_table;
+ uint64_t entry;
+ unsigned int nb_clusters;
+ int ret;
+
+ uint64_t alloc_cluster_offset;
+
+ trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
+ *bytes);
+ assert(*bytes > 0);
+
+ /*
+ * Calculate the number of clusters to look for. We stop at L2 table
+ * boundaries to keep things simple.
+ */
+ nb_clusters =
+ size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
+
+ l2_index = offset_to_l2_index(s, guest_offset);
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ /* Find L2 entry for the first involved cluster */
+ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ entry = be64_to_cpu(l2_table[l2_index]);
+
+ /* For the moment, overwrite compressed clusters one by one */
+ if (entry & QCOW_OFLAG_COMPRESSED) {
+ nb_clusters = 1;
+ } else {
+ nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
+ }
+
+ /* This function is only called when there were no non-COW clusters, so if
+ * we can't find any unallocated or COW clusters either, something is
+ * wrong with our code. */
+ assert(nb_clusters > 0);
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Allocate, if necessary at a given offset in the image file */
+ alloc_cluster_offset = start_of_cluster(s, *host_offset);
+ ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
+ &nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Can't extend contiguous allocation */
+ if (nb_clusters == 0) {
+ *bytes = 0;
+ return 0;
+ }
+
+ /*
+ * Save info needed for the metadata update.
+ *
+ * requested_sectors: Number of sectors from the start of the first
+ * newly allocated cluster to the end of the write request (which may
+ * have been shortened above).
+ *
+ * avail_sectors: Number of sectors from the start of the first newly
+ * allocated cluster to the end of the last newly allocated cluster.
+ *
+ * nb_sectors: The number of sectors from the start of the first
+ * newly allocated cluster to the end of the area that the write
+ * request actually writes to (excluding COW at the end)
+ */
+ int requested_sectors =
+ (*bytes + offset_into_cluster(s, guest_offset))
+ >> BDRV_SECTOR_BITS;
+ int avail_sectors = nb_clusters
+ << (s->cluster_bits - BDRV_SECTOR_BITS);
+ int alloc_n_start = offset_into_cluster(s, guest_offset)
+ >> BDRV_SECTOR_BITS;
+ int nb_sectors = MIN(requested_sectors, avail_sectors);
+ QCowL2Meta *old_m = *m;
+
+ *m = g_malloc0(sizeof(**m));
+
+ **m = (QCowL2Meta) {
+ .next = old_m,
+
+ .alloc_offset = alloc_cluster_offset,
+ .offset = start_of_cluster(s, guest_offset),
+ .nb_clusters = nb_clusters,
+ .nb_available = nb_sectors,
+
+ .cow_start = {
+ .offset = 0,
+ .nb_sectors = alloc_n_start,
+ },
+ .cow_end = {
+ .offset = nb_sectors * BDRV_SECTOR_SIZE,
+ .nb_sectors = avail_sectors - nb_sectors,
+ },
+ };
+ qemu_co_queue_init(&(*m)->dependent_requests);
+ QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+ *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
+ *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
+ - offset_into_cluster(s, guest_offset));
+ assert(*bytes != 0);
+
+ return 1;
+
+fail:
+ if (*m && (*m)->nb_clusters > 0) {
+ QLIST_REMOVE(*m, next_in_flight);
+ }
+ return ret;
+}
+
+/*
+ * alloc_cluster_offset
+ *
+ * For a given offset on the virtual disk, find the cluster offset in qcow2
+ * file. If the offset is not found, allocate a new cluster.
+ *
+ * If the cluster was already allocated, m->nb_clusters is set to 0 and
+ * other fields in m are meaningless.
+ *
+ * If the cluster is newly allocated, m->nb_clusters is set to the number of
+ * contiguous clusters that have been allocated. In this case, the other
+ * fields of m are valid and contain information about the first allocated
+ * cluster.
+ *
+ * If the request conflicts with another write request in flight, the coroutine
+ * is queued and will be reentered when the dependency has completed.
+ *
+ * Return 0 on success and -errno in error cases
+ */
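+/*
+ * Expected call sequence in the write path (a rough sketch, not enforced
+ * here): call qcow2_alloc_cluster_offset(), write the guest data to
+ * *host_offset, then call qcow2_alloc_cluster_link_l2() for every QCowL2Meta
+ * in the returned chain and restart its dependent_requests queue.
+ */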
+int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t start, remaining;
+ uint64_t cluster_offset;
+ uint64_t cur_bytes;
+ int ret;
+
+ trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
+ n_start, n_end);
+
+ assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
+ offset = start_of_cluster(s, offset);
+
+again:
+ start = offset + (n_start << BDRV_SECTOR_BITS);
+ remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
+ cluster_offset = 0;
+ *host_offset = 0;
+ cur_bytes = 0;
+ *m = NULL;
+
+ while (true) {
+
+ if (!*host_offset) {
+ *host_offset = start_of_cluster(s, cluster_offset);
+ }
+
+ assert(remaining >= cur_bytes);
+
+ start += cur_bytes;
+ remaining -= cur_bytes;
+ cluster_offset += cur_bytes;
+
+ if (remaining == 0) {
+ break;
+ }
+
+ cur_bytes = remaining;
+
+ /*
+ * Now start gathering as many contiguous clusters as possible:
+ *
+ * 1. Check for overlaps with in-flight allocations
+ *
+ * a) Overlap not in the first cluster -> shorten this request and
+ * let the caller handle the rest in its next loop iteration.
+ *
+ * b) Real overlaps of two requests. Yield and restart the search
+ * for contiguous clusters (the situation could have changed
+ * while we were sleeping)
+ *
+ * c) TODO: Request starts in the same cluster as the in-flight
+ * allocation ends. Shorten the COW of the in-flight allocation,
+ * set cluster_offset to write to the same cluster and set up
+ * the right synchronisation between the in-flight request and
+ * the new one.
+ */
+ ret = handle_dependencies(bs, start, &cur_bytes, m);
+ if (ret == -EAGAIN) {
+ /* Currently handle_dependencies() doesn't yield if we already had
+ * an allocation. If it did, we would have to clean up the L2Meta
+ * structs before starting over. */
+ assert(*m == NULL);
+ goto again;
+ } else if (ret < 0) {
+ return ret;
+ } else if (cur_bytes == 0) {
+ break;
+ } else {
+ /* handle_dependencies() may have decreased cur_bytes (shortened
+ * the allocations below) so that the next dependency is processed
+ * correctly during the next loop iteration. */
+ }
+
+ /*
+ * 2. Count contiguous COPIED clusters.
+ */
+ ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ continue;
+ } else if (cur_bytes == 0) {
+ break;
+ }
+
+ /*
+ * 3. If the request still hasn't completed, allocate new clusters,
+ * considering any cluster_offset of steps 1c or 2.
+ */
+ ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ continue;
+ } else {
+ assert(cur_bytes == 0);
+ break;
+ }
+ }
+
+ *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
+ assert(*num > 0);
+ assert(*host_offset != 0);
+
+ return 0;
+}
+
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+ const uint8_t *buf, int buf_size)
+{
+ z_stream strm1, *strm = &strm1;
+ int ret, out_len;
+
+ memset(strm, 0, sizeof(*strm));
+
+ strm->next_in = (uint8_t *)buf;
+ strm->avail_in = buf_size;
+ strm->next_out = out_buf;
+ strm->avail_out = out_buf_size;
+
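+ /* Negative windowBits selects a raw deflate stream without a zlib header,
+ * which is the format the qcow2 compression code writes. */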
+ ret = inflateInit2(strm, -12);
+ if (ret != Z_OK)
+ return -1;
+ ret = inflate(strm, Z_FINISH);
+ out_len = strm->next_out - out_buf;
+ if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+ out_len != out_buf_size) {
+ inflateEnd(strm);
+ return -1;
+ }
+ inflateEnd(strm);
+ return 0;
+}
+
+int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, csize, nb_csectors, sector_offset;
+ uint64_t coffset;
+
+ coffset = cluster_offset & s->cluster_offset_mask;
+ if (s->cluster_cache_offset != coffset) {
+ nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
+ sector_offset = coffset & 511;
+ csize = nb_csectors * 512 - sector_offset;
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+ ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
+ if (ret < 0) {
+ return ret;
+ }
+ if (decompress_buffer(s->cluster_cache, s->cluster_size,
+ s->cluster_data + sector_offset, csize) < 0) {
+ return -EIO;
+ }
+ s->cluster_cache_offset = coffset;
+ }
+ return 0;
+}
+
+/*
+ * This discards as many clusters of nb_clusters as possible at once (i.e.
+ * all clusters in the same L2 table) and returns the number of discarded
+ * clusters.
+ */
+static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
+ unsigned int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table;
+ int l2_index;
+ int ret;
+ int i;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Limit nb_clusters to one L2 table */
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t old_offset;
+
+ old_offset = be64_to_cpu(l2_table[l2_index + i]);
+ if ((old_offset & L2E_OFFSET_MASK) == 0) {
+ continue;
+ }
+
+ /* First remove L2 entries */
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ l2_table[l2_index + i] = cpu_to_be64(0);
+
+ /* Then decrease the refcount */
+ qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return nb_clusters;
+}
+
+int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
+ int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t end_offset;
+ unsigned int nb_clusters;
+ int ret;
+
+ end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
+
+ /* Round start up and end down */
+ offset = align_offset(offset, s->cluster_size);
+ end_offset &= ~(s->cluster_size - 1);
+
+ if (offset > end_offset) {
+ return 0;
+ }
+
+ nb_clusters = size_to_clusters(s, end_offset - offset);
+
+ s->cache_discards = true;
+
+ /* Each L2 table is handled by its own loop iteration */
+ while (nb_clusters > 0) {
+ ret = discard_single_l2(bs, offset, nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ nb_clusters -= ret;
+ offset += (ret * s->cluster_size);
+ }
+
+ ret = 0;
+fail:
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ return ret;
+}
+
+/*
+ * This zeroes as many clusters of nb_clusters as possible at once (i.e.
+ * all clusters in the same L2 table) and returns the number of zeroed
+ * clusters.
+ */
+static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
+ unsigned int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table;
+ int l2_index;
+ int ret;
+ int i;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Limit nb_clusters to one L2 table */
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ for (i = 0; i < nb_clusters; i++) {
+ uint64_t old_offset;
+
+ old_offset = be64_to_cpu(l2_table[l2_index + i]);
+
+ /* Update L2 entries */
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ if (old_offset & QCOW_OFLAG_COMPRESSED) {
+ l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+ qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
+ } else {
+ l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
+ }
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return nb_clusters;
+}
+
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int nb_clusters;
+ int ret;
+
+ /* The zero flag is only supported by version 3 and newer */
+ if (s->qcow_version < 3) {
+ return -ENOTSUP;
+ }
+
+ /* Each L2 table is handled by its own loop iteration */
+ nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
+
+ s->cache_discards = true;
+
+ while (nb_clusters > 0) {
+ ret = zero_single_l2(bs, offset, nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ nb_clusters -= ret;
+ offset += (ret * s->cluster_size);
+ }
+
+ ret = 0;
+fail:
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ return ret;
+}
diff --git a/contrib/qemu/block/qcow2-refcount.c b/contrib/qemu/block/qcow2-refcount.c
new file mode 100644
index 000000000..1244693f3
--- /dev/null
+++ b/contrib/qemu/block/qcow2-refcount.c
@@ -0,0 +1,1374 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/qcow2.h"
+
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
+ int64_t offset, int64_t length,
+ int addend, enum qcow2_discard_type type);
+
+
+/*********************************************************/
+/* refcount handling */
+
+int qcow2_refcount_init(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, refcount_table_size2, i;
+
+ refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
+ s->refcount_table = g_malloc(refcount_table_size2);
+ if (s->refcount_table_size > 0) {
+ BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
+ ret = bdrv_pread(bs->file, s->refcount_table_offset,
+ s->refcount_table, refcount_table_size2);
+ if (ret != refcount_table_size2)
+ goto fail;
+ for(i = 0; i < s->refcount_table_size; i++)
+ be64_to_cpus(&s->refcount_table[i]);
+ }
+ return 0;
+ fail:
+ return -ENOMEM;
+}
+
+void qcow2_refcount_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ g_free(s->refcount_table);
+}
+
+
+static int load_refcount_block(BlockDriverState *bs,
+ int64_t refcount_block_offset,
+ void **refcount_block)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
+ ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+ refcount_block);
+
+ return ret;
+}
+
+/*
+ * Returns the refcount of the cluster given by its index. Any non-negative
+ * return value is the refcount of the cluster, negative values are -errno
+ * and indicate an error.
+ */
+static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ int refcount_table_index, block_index;
+ int64_t refcount_block_offset;
+ int ret;
+ uint16_t *refcount_block;
+ uint16_t refcount;
+
+ refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+ if (refcount_table_index >= s->refcount_table_size)
+ return 0;
+ refcount_block_offset = s->refcount_table[refcount_table_index];
+ if (!refcount_block_offset)
+ return 0;
+
+ ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ block_index = cluster_index &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+ refcount = be16_to_cpu(refcount_block[block_index]);
+
+ ret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return refcount;
+}
+
+/*
+ * Rounds the refcount table size up to avoid growing the table for each single
+ * refcount block that is allocated.
+ */
+static unsigned int next_refcount_table_size(BDRVQcowState *s,
+ unsigned int min_size)
+{
+ unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
+ unsigned int refcount_table_clusters =
+ MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
+
+ while (min_clusters > refcount_table_clusters) {
+ refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
+ }
+
+ return refcount_table_clusters << (s->cluster_bits - 3);
+}
+
+
+/* Checks if two offsets are described by the same refcount block */
+static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
+ uint64_t offset_b)
+{
+ uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
+ uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
+
+ return (block_a == block_b);
+}
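+/*
+ * Example (assuming the default 64 KiB clusters and 16 bit refcount
+ * entries): one refcount block holds 32768 entries and therefore covers
+ * 2 GiB of host file offsets, so two offsets are in the same block iff
+ * offset_a >> 31 == offset_b >> 31.
+ */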
+
+/*
+ * Loads a refcount block. If it doesn't exist yet, it is allocated first
+ * (including growing the refcount table if needed).
+ *
+ * Returns 0 on success or -errno in error case
+ */
+static int alloc_refcount_block(BlockDriverState *bs,
+ int64_t cluster_index, uint16_t **refcount_block)
+{
+ BDRVQcowState *s = bs->opaque;
+ unsigned int refcount_table_index;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
+
+ /* Find the refcount block for the given cluster */
+ refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+
+ if (refcount_table_index < s->refcount_table_size) {
+
+ uint64_t refcount_block_offset =
+ s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
+
+ /* If it's already there, we're done */
+ if (refcount_block_offset) {
+ return load_refcount_block(bs, refcount_block_offset,
+ (void**) refcount_block);
+ }
+ }
+
+ /*
+ * If we come here, we need to allocate something: at least a cluster for
+ * the new refcount block, and possibly also a new refcount table if the
+ * old refcount table is too small.
+ *
+ * Note that allocating clusters here needs some special care:
+ *
+ * - We can't use the normal qcow2_alloc_clusters(), it would try to
+ * increase the refcount and very likely we would end up with an endless
+ * recursion. Instead we must place the refcount blocks in a way that
+ * they can describe them themselves.
+ *
+ * - We need to consider that at this point we are inside update_refcounts
+ * and doing the initial refcount increase. This means that some clusters
+ * have already been allocated by the caller, but their refcount isn't
+ * accurate yet. free_cluster_index tells us where this allocation ends
+ * as long as we don't overwrite it by freeing clusters.
+ *
+ * - alloc_clusters_noref and qcow2_free_clusters may load a different
+ * refcount block into the cache
+ */
+
+ *refcount_block = NULL;
+
+ /* We write to the refcount table, so we might depend on L2 tables */
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Allocate the refcount block itself and mark it as used */
+ int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
+ if (new_block < 0) {
+ return new_block;
+ }
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
+ " at %" PRIx64 "\n",
+ refcount_table_index, cluster_index << s->cluster_bits, new_block);
+#endif
+
+ if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
+ /* Zero the new refcount block before updating it */
+ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+ (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ memset(*refcount_block, 0, s->cluster_size);
+
+ /* The block describes itself, need to update the cache */
+ int block_index = (new_block >> s->cluster_bits) &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+ (*refcount_block)[block_index] = cpu_to_be16(1);
+ } else {
+ /* Described somewhere else. This can recurse at most twice before we
+ * arrive at a block that describes itself. */
+ ret = update_refcount(bs, new_block, s->cluster_size, 1,
+ QCOW2_DISCARD_NEVER);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /* Initialize the new refcount block only after updating its refcount,
+ * update_refcount uses the refcount cache itself */
+ ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+ (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ memset(*refcount_block, 0, s->cluster_size);
+ }
+
+ /* Now the new refcount block needs to be written to disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
+ qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /* If the refcount table is big enough, just hook the block up there */
+ if (refcount_table_index < s->refcount_table_size) {
+ uint64_t data64 = cpu_to_be64(new_block);
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
+ ret = bdrv_pwrite_sync(bs->file,
+ s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
+ &data64, sizeof(data64));
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ s->refcount_table[refcount_table_index] = new_block;
+ return 0;
+ }
+
+ ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+ if (ret < 0) {
+ goto fail_block;
+ }
+
+ /*
+ * If we come here, we need to grow the refcount table. Again, a new
+ * refcount table needs some space and we can't simply allocate to avoid
+ * endless recursion.
+ *
+ * Therefore let's grab new refcount blocks at the end of the image, which
+ * will describe themselves and the new refcount table. This way we can
+ * reference them only in the new table and do the switch to the new
+ * refcount table at once without producing an inconsistent state in
+ * between.
+ */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
+
+ /* Calculate the number of refcount blocks needed so far */
+ uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
+ uint64_t blocks_used = (s->free_cluster_index +
+ refcount_block_clusters - 1) / refcount_block_clusters;
+
+ /* And now we need at least one block more for the new metadata */
+ uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
+ uint64_t last_table_size;
+ uint64_t blocks_clusters;
+ do {
+ uint64_t table_clusters =
+ size_to_clusters(s, table_size * sizeof(uint64_t));
+ blocks_clusters = 1 +
+ ((table_clusters + refcount_block_clusters - 1)
+ / refcount_block_clusters);
+ uint64_t meta_clusters = table_clusters + blocks_clusters;
+
+ last_table_size = table_size;
+ table_size = next_refcount_table_size(s, blocks_used +
+ ((meta_clusters + refcount_block_clusters - 1)
+ / refcount_block_clusters));
+
+ } while (last_table_size != table_size);
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
+ s->refcount_table_size, table_size);
+#endif
+
+ /* Create the new refcount table and blocks */
+ uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
+ s->cluster_size;
+ uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
+ uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
+ uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
+
+ assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
+
+ /* Fill the new refcount table */
+ memcpy(new_table, s->refcount_table,
+ s->refcount_table_size * sizeof(uint64_t));
+ new_table[refcount_table_index] = new_block;
+
+ int i;
+ for (i = 0; i < blocks_clusters; i++) {
+ new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
+ }
+
+ /* Fill the refcount blocks */
+ uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
+ int block = 0;
+ for (i = 0; i < table_clusters + blocks_clusters; i++) {
+ new_blocks[block++] = cpu_to_be16(1);
+ }
+
+ /* Write refcount blocks to disk */
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
+ ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
+ blocks_clusters * s->cluster_size);
+ g_free(new_blocks);
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ /* Write refcount table to disk */
+ for(i = 0; i < table_size; i++) {
+ cpu_to_be64s(&new_table[i]);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
+ ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
+ table_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ for(i = 0; i < table_size; i++) {
+ be64_to_cpus(&new_table[i]);
+ }
+
+ /* Hook up the new refcount table in the qcow2 header */
+ uint8_t data[12];
+ cpu_to_be64w((uint64_t*)data, table_offset);
+ cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
+ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
+ data, sizeof(data));
+ if (ret < 0) {
+ goto fail_table;
+ }
+
+ /* And switch it in memory */
+ uint64_t old_table_offset = s->refcount_table_offset;
+ uint64_t old_table_size = s->refcount_table_size;
+
+ g_free(s->refcount_table);
+ s->refcount_table = new_table;
+ s->refcount_table_size = table_size;
+ s->refcount_table_offset = table_offset;
+
+ /* Free old table. Remember, we must not change free_cluster_index */
+ uint64_t old_free_cluster_index = s->free_cluster_index;
+ qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
+ QCOW2_DISCARD_OTHER);
+ s->free_cluster_index = old_free_cluster_index;
+
+ ret = load_refcount_block(bs, new_block, (void**) refcount_block);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+
+fail_table:
+ g_free(new_table);
+fail_block:
+ if (*refcount_block != NULL) {
+ qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+ }
+ return ret;
+}
+
+void qcow2_process_discards(BlockDriverState *bs, int ret)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2DiscardRegion *d, *next;
+
+ QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
+ QTAILQ_REMOVE(&s->discards, d, next);
+
+ /* Discard is optional, ignore the return value */
+ if (ret >= 0) {
+ bdrv_discard(bs->file,
+ d->offset >> BDRV_SECTOR_BITS,
+ d->bytes >> BDRV_SECTOR_BITS);
+ }
+
+ g_free(d);
+ }
+}
+
+static void update_refcount_discard(BlockDriverState *bs,
+ uint64_t offset, uint64_t length)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2DiscardRegion *d, *p, *next;
+
+ QTAILQ_FOREACH(d, &s->discards, next) {
+ uint64_t new_start = MIN(offset, d->offset);
+ uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
+
+ if (new_end - new_start <= length + d->bytes) {
+ /* There can't be any overlap, areas ending up here have no
+ * references any more and therefore shouldn't get freed another
+ * time. */
+ assert(d->bytes + length == new_end - new_start);
+ d->offset = new_start;
+ d->bytes = new_end - new_start;
+ goto found;
+ }
+ }
+
+ d = g_malloc(sizeof(*d));
+ *d = (Qcow2DiscardRegion) {
+ .bs = bs,
+ .offset = offset,
+ .bytes = length,
+ };
+ QTAILQ_INSERT_TAIL(&s->discards, d, next);
+
+found:
+ /* Merge discard requests if they are adjacent now */
+ QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
+ if (p == d
+ || p->offset > d->offset + d->bytes
+ || d->offset > p->offset + p->bytes)
+ {
+ continue;
+ }
+
+ /* Still no overlap possible */
+ assert(p->offset == d->offset + d->bytes
+ || d->offset == p->offset + p->bytes);
+
+ QTAILQ_REMOVE(&s->discards, p, next);
+ d->offset = MIN(d->offset, p->offset);
+ d->bytes += p->bytes;
+ }
+}
+
+/* XXX: cache several refcount block clusters ? */
+static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
+ int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t start, last, cluster_offset;
+ uint16_t *refcount_block = NULL;
+ int64_t old_table_index = -1;
+ int ret;
+
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
+ offset, length, addend);
+#endif
+ if (length < 0) {
+ return -EINVAL;
+ } else if (length == 0) {
+ return 0;
+ }
+
+ if (addend < 0) {
+ qcow2_cache_set_dependency(bs, s->refcount_block_cache,
+ s->l2_table_cache);
+ }
+
+ start = offset & ~(s->cluster_size - 1);
+ last = (offset + length - 1) & ~(s->cluster_size - 1);
+ for(cluster_offset = start; cluster_offset <= last;
+ cluster_offset += s->cluster_size)
+ {
+ int block_index, refcount;
+ int64_t cluster_index = cluster_offset >> s->cluster_bits;
+ int64_t table_index =
+ cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+
+ /* Load the refcount block and allocate it if needed */
+ if (table_index != old_table_index) {
+ if (refcount_block) {
+ ret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ old_table_index = table_index;
+
+ qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
+
+ /* we can update the count and save it */
+ block_index = cluster_index &
+ ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+
+ refcount = be16_to_cpu(refcount_block[block_index]);
+ refcount += addend;
+ if (refcount < 0 || refcount > 0xffff) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (refcount == 0 && cluster_index < s->free_cluster_index) {
+ s->free_cluster_index = cluster_index;
+ }
+ refcount_block[block_index] = cpu_to_be16(refcount);
+
+ if (refcount == 0 && s->discard_passthrough[type]) {
+ update_refcount_discard(bs, cluster_offset, s->cluster_size);
+ }
+ }
+
+ ret = 0;
+fail:
+ if (!s->cache_discards) {
+ qcow2_process_discards(bs, ret);
+ }
+
+ /* Write last changed block to disk */
+ if (refcount_block) {
+ int wret;
+ wret = qcow2_cache_put(bs, s->refcount_block_cache,
+ (void**) &refcount_block);
+ if (wret < 0) {
+ return ret < 0 ? ret : wret;
+ }
+ }
+
+ /*
+ * Try to undo any updates if an error is returned (this may succeed in
+ * some cases, such as ENOSPC when allocating a new refcount block)
+ */
+ if (ret < 0) {
+ int dummy;
+ dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
+ QCOW2_DISCARD_NEVER);
+ (void)dummy;
+ }
+
+ return ret;
+}
+
+/*
+ * Increases or decreases the refcount of a given cluster by one.
+ * addend must be 1 or -1.
+ *
+ * If the return value is non-negative, it is the new refcount of the cluster.
+ * If it is negative, it is -errno and indicates an error.
+ */
+static int update_cluster_refcount(BlockDriverState *bs,
+ int64_t cluster_index,
+ int addend,
+ enum qcow2_discard_type type)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
+ type);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return get_refcount(bs, cluster_index);
+}
+
+
+
+/*********************************************************/
+/* cluster allocation functions */
+
+
+
+/* return < 0 if error */
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, nb_clusters, refcount;
+
+ nb_clusters = size_to_clusters(s, size);
+retry:
+ for(i = 0; i < nb_clusters; i++) {
+ int64_t next_cluster_index = s->free_cluster_index++;
+ refcount = get_refcount(bs, next_cluster_index);
+
+ if (refcount < 0) {
+ return refcount;
+ } else if (refcount != 0) {
+ goto retry;
+ }
+ }
+#ifdef DEBUG_ALLOC2
+ fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
+ size,
+ (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+#endif
+ return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+}
+
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
+{
+ int64_t offset;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
+ offset = alloc_clusters_noref(bs, size);
+ if (offset < 0) {
+ return offset;
+ }
+
+ ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return offset;
+}
+
+int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t cluster_index;
+ uint64_t old_free_cluster_index;
+ int i, refcount, ret;
+
+ /* Check how many clusters there are free */
+ cluster_index = offset >> s->cluster_bits;
+ for(i = 0; i < nb_clusters; i++) {
+ refcount = get_refcount(bs, cluster_index++);
+
+ if (refcount < 0) {
+ return refcount;
+ } else if (refcount != 0) {
+ break;
+ }
+ }
+
+ /* And then allocate them */
+ old_free_cluster_index = s->free_cluster_index;
+ s->free_cluster_index = cluster_index + i;
+
+ ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->free_cluster_index = old_free_cluster_index;
+
+ return i;
+}
+
+/* only used to allocate compressed sectors. We try to allocate
+ contiguous sectors. size must be <= cluster_size */
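+/*
+ * Descriptive note: s->free_byte_offset remembers the first unused byte in
+ * the most recently used cluster. New compressed data is appended there if
+ * it fits; otherwise a fresh cluster is allocated. If that cluster happens
+ * to directly follow the old one, the allocation still starts at the old
+ * free_byte_offset and spills over; otherwise it restarts at the beginning
+ * of the new cluster.
+ */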
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t offset, cluster_offset;
+ int free_in_cluster;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
+ assert(size > 0 && size <= s->cluster_size);
+ if (s->free_byte_offset == 0) {
+ offset = qcow2_alloc_clusters(bs, s->cluster_size);
+ if (offset < 0) {
+ return offset;
+ }
+ s->free_byte_offset = offset;
+ }
+ redo:
+ free_in_cluster = s->cluster_size -
+ (s->free_byte_offset & (s->cluster_size - 1));
+ if (size <= free_in_cluster) {
+ /* enough space in current cluster */
+ offset = s->free_byte_offset;
+ s->free_byte_offset += size;
+ free_in_cluster -= size;
+ if (free_in_cluster == 0)
+ s->free_byte_offset = 0;
+ if ((offset & (s->cluster_size - 1)) != 0)
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
+ } else {
+ offset = qcow2_alloc_clusters(bs, s->cluster_size);
+ if (offset < 0) {
+ return offset;
+ }
+ cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+ if ((cluster_offset + s->cluster_size) == offset) {
+ /* we are lucky: contiguous data */
+ offset = s->free_byte_offset;
+ update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+ QCOW2_DISCARD_NEVER);
+ s->free_byte_offset += size;
+ } else {
+ s->free_byte_offset = offset;
+ goto redo;
+ }
+ }
+
+ /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
+ * or explicitly by update_cluster_refcount(). Refcount blocks must be
+ * flushed before the caller's L2 table updates.
+ */
+ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
+ return offset;
+}
+
+void qcow2_free_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t size,
+ enum qcow2_discard_type type)
+{
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
+ ret = update_refcount(bs, offset, size, -1, type);
+ if (ret < 0) {
+ fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
+ /* TODO Remember the clusters to free them later and avoid leaking */
+ }
+}
+
+/*
+ * Free a cluster using its L2 entry (handles clusters of all types, e.g.
+ * normal cluster, compressed cluster, etc.)
+ */
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+ int nb_clusters, enum qcow2_discard_type type)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ switch (qcow2_get_cluster_type(l2_entry)) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ {
+ int nb_csectors;
+ nb_csectors = ((l2_entry >> s->csize_shift) &
+ s->csize_mask) + 1;
+ qcow2_free_clusters(bs,
+ (l2_entry & s->cluster_offset_mask) & ~511,
+ nb_csectors * 512, type);
+ }
+ break;
+ case QCOW2_CLUSTER_NORMAL:
+ qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
+ nb_clusters << s->cluster_bits, type);
+ break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ case QCOW2_CLUSTER_ZERO:
+ break;
+ default:
+ abort();
+ }
+}
+
+
+
+/*********************************************************/
+/* snapshots and image creation */
+
+
+
+/* update the refcounts of snapshots and the copied flag */
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+ int64_t l1_table_offset, int l1_size, int addend)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
+ int64_t old_offset, old_l2_offset;
+ int i, j, l1_modified = 0, nb_csectors, refcount;
+ int ret;
+
+ l2_table = NULL;
+ l1_table = NULL;
+ l1_size2 = l1_size * sizeof(uint64_t);
+
+ s->cache_discards = true;
+
+ /* WARNING: qcow2_snapshot_goto relies on this function not using the
+ * l1_table_offset when it is the current s->l1_table_offset! Be careful
+ * when changing this! */
+ if (l1_table_offset != s->l1_table_offset) {
+ l1_table = g_malloc0(align_offset(l1_size2, 512));
+ l1_allocated = 1;
+
+ ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(i = 0;i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+ } else {
+ assert(l1_size == s->l1_size);
+ l1_table = s->l1_table;
+ l1_allocated = 0;
+ }
+
+ for(i = 0; i < l1_size; i++) {
+ l2_offset = l1_table[i];
+ if (l2_offset) {
+ old_l2_offset = l2_offset;
+ l2_offset &= L1E_OFFSET_MASK;
+
+ ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+ (void**) &l2_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ for(j = 0; j < s->l2_size; j++) {
+ offset = be64_to_cpu(l2_table[j]);
+ if (offset != 0) {
+ old_offset = offset;
+ offset &= ~QCOW_OFLAG_COPIED;
+ if (offset & QCOW_OFLAG_COMPRESSED) {
+ nb_csectors = ((offset >> s->csize_shift) &
+ s->csize_mask) + 1;
+ if (addend != 0) {
+ int ret;
+ ret = update_refcount(bs,
+ (offset & s->cluster_offset_mask) & ~511,
+ nb_csectors * 512, addend,
+ QCOW2_DISCARD_SNAPSHOT);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ /* compressed clusters are never modified */
+ refcount = 2;
+ } else {
+ uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
+ if (addend != 0) {
+ refcount = update_cluster_refcount(bs, cluster_index, addend,
+ QCOW2_DISCARD_SNAPSHOT);
+ } else {
+ refcount = get_refcount(bs, cluster_index);
+ }
+
+ if (refcount < 0) {
+ ret = refcount;
+ goto fail;
+ }
+ }
+
+ if (refcount == 1) {
+ offset |= QCOW_OFLAG_COPIED;
+ }
+ if (offset != old_offset) {
+ if (addend > 0) {
+ qcow2_cache_set_dependency(bs, s->l2_table_cache,
+ s->refcount_block_cache);
+ }
+ l2_table[j] = cpu_to_be64(offset);
+ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ }
+ }
+ }
+
+ ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ if (ret < 0) {
+ goto fail;
+ }
+
+
+ if (addend != 0) {
+ refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
+ QCOW2_DISCARD_SNAPSHOT);
+ } else {
+ refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+ }
+ if (refcount < 0) {
+ ret = refcount;
+ goto fail;
+ } else if (refcount == 1) {
+ l2_offset |= QCOW_OFLAG_COPIED;
+ }
+ if (l2_offset != old_l2_offset) {
+ l1_table[i] = l2_offset;
+ l1_modified = 1;
+ }
+ }
+ }
+
+ ret = bdrv_flush(bs);
+fail:
+ if (l2_table) {
+ qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+ }
+
+ s->cache_discards = false;
+ qcow2_process_discards(bs, ret);
+
+ /* Update L1 only if it isn't deleted anyway (addend = -1) */
+ if (ret == 0 && addend >= 0 && l1_modified) {
+ for (i = 0; i < l1_size; i++) {
+ cpu_to_be64s(&l1_table[i]);
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
+
+ for (i = 0; i < l1_size; i++) {
+ be64_to_cpus(&l1_table[i]);
+ }
+ }
+ if (l1_allocated)
+ g_free(l1_table);
+ return ret;
+}
+
+
+
+
+/*********************************************************/
+/* refcount checking functions */
+
+
+
+/*
+ * Increases the refcount for a range of clusters in a given refcount table.
+ * This is used to construct a temporary refcount table out of L1 and L2 tables
+ * which can be compared to the refcount table saved in the image.
+ *
+ * Modifies the number of errors in res.
+ */
+static void inc_refcounts(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t offset, int64_t size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t start, last, cluster_offset;
+ int k;
+
+ if (size <= 0)
+ return;
+
+ start = offset & ~(s->cluster_size - 1);
+ last = (offset + size - 1) & ~(s->cluster_size - 1);
+ for(cluster_offset = start; cluster_offset <= last;
+ cluster_offset += s->cluster_size) {
+ k = cluster_offset >> s->cluster_bits;
+ if (k < 0) {
+ fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+ cluster_offset);
+ res->corruptions++;
+ } else if (k >= refcount_table_size) {
+ fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
+ "the end of the image file, can't properly check refcounts.\n",
+ cluster_offset);
+ res->check_errors++;
+ } else {
+ if (++refcount_table[k] == 0) {
+ fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
+ "\n", cluster_offset);
+ res->corruptions++;
+ }
+ }
+ }
+}
+
+/* Flags for check_refcounts_l1() and check_refcounts_l2() */
+enum {
+ CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */
+ CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */
+};
+
+/*
+ * Increases the refcount in the given refcount table for all clusters
+ * referenced in the L2 table. While doing so, performs some checks on L2
+ * entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
+ uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
+ int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table, l2_entry;
+ uint64_t next_contiguous_offset = 0;
+ int i, l2_size, nb_csectors, refcount;
+
+ /* Read L2 table from disk */
+ l2_size = s->l2_size * sizeof(uint64_t);
+ l2_table = g_malloc(l2_size);
+
+ if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
+ goto fail;
+
+ /* Do the actual checks */
+ for(i = 0; i < s->l2_size; i++) {
+ l2_entry = be64_to_cpu(l2_table[i]);
+
+ switch (qcow2_get_cluster_type(l2_entry)) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* Compressed clusters don't have QCOW_OFLAG_COPIED */
+ if (l2_entry & QCOW_OFLAG_COPIED) {
+ fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+ "copied flag must never be set for compressed "
+ "clusters\n", l2_entry >> s->cluster_bits);
+ l2_entry &= ~QCOW_OFLAG_COPIED;
+ res->corruptions++;
+ }
+
+ /* Mark cluster as used */
+ nb_csectors = ((l2_entry >> s->csize_shift) &
+ s->csize_mask) + 1;
+ l2_entry &= s->cluster_offset_mask;
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l2_entry & ~511, nb_csectors * 512);
+
+ if (flags & CHECK_FRAG_INFO) {
+ res->bfi.allocated_clusters++;
+ res->bfi.compressed_clusters++;
+
+ /* Compressed clusters are fragmented by nature. Since they
+ * take up sub-sector space but we only have sector granularity
+ * I/O, we need to re-read the same sectors even for adjacent
+ * compressed clusters.
+ */
+ res->bfi.fragmented_clusters++;
+ }
+ break;
+
+ case QCOW2_CLUSTER_ZERO:
+ if ((l2_entry & L2E_OFFSET_MASK) == 0) {
+ break;
+ }
+ /* fall through */
+
+ case QCOW2_CLUSTER_NORMAL:
+ {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ uint64_t offset = l2_entry & L2E_OFFSET_MASK;
+
+ if (flags & CHECK_OFLAG_COPIED) {
+ refcount = get_refcount(bs, offset >> s->cluster_bits);
+ if (refcount < 0) {
+ fprintf(stderr, "Can't get refcount for offset %"
+ PRIx64 ": %s\n", l2_entry, strerror(-refcount));
+ goto fail;
+ }
+ if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
+ PRIx64 " refcount=%d\n", l2_entry, refcount);
+ res->corruptions++;
+ }
+ }
+
+ if (flags & CHECK_FRAG_INFO) {
+ res->bfi.allocated_clusters++;
+ if (next_contiguous_offset &&
+ offset != next_contiguous_offset) {
+ res->bfi.fragmented_clusters++;
+ }
+ next_contiguous_offset = offset + s->cluster_size;
+ }
+
+ /* Mark cluster as used */
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ offset, s->cluster_size);
+
+ /* Correct offsets are cluster aligned */
+ if (offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
+ "properly aligned; L2 entry corrupted.\n", offset);
+ res->corruptions++;
+ }
+ break;
+ }
+
+ case QCOW2_CLUSTER_UNALLOCATED:
+ break;
+
+ default:
+ abort();
+ }
+ }
+
+ g_free(l2_table);
+ return 0;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
+ g_free(l2_table);
+ return -EIO;
+}
+
+/*
+ * Increases the refcount for the L1 table, its L2 tables and all referenced
+ * clusters in the given refcount table. While doing so, performs some checks
+ * on L1 and L2 entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l1(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t l1_table_offset, int l1_size,
+ int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l1_table, l2_offset, l1_size2;
+ int i, refcount, ret;
+
+ l1_size2 = l1_size * sizeof(uint64_t);
+
+ /* Mark L1 table as used */
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l1_table_offset, l1_size2);
+
+ /* Read L1 table entries from disk */
+ if (l1_size2 == 0) {
+ l1_table = NULL;
+ } else {
+ l1_table = g_malloc(l1_size2);
+ if (bdrv_pread(bs->file, l1_table_offset,
+ l1_table, l1_size2) != l1_size2)
+ goto fail;
+ for(i = 0;i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+ }
+
+ /* Do the actual checks */
+ for(i = 0; i < l1_size; i++) {
+ l2_offset = l1_table[i];
+ if (l2_offset) {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ if (flags & CHECK_OFLAG_COPIED) {
+ refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
+ >> s->cluster_bits);
+ if (refcount < 0) {
+ fprintf(stderr, "Can't get refcount for l2_offset %"
+ PRIx64 ": %s\n", l2_offset, strerror(-refcount));
+ goto fail;
+ }
+ if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
+ " refcount=%d\n", l2_offset, refcount);
+ res->corruptions++;
+ }
+ }
+
+ /* Mark L2 table as used */
+ l2_offset &= L1E_OFFSET_MASK;
+ inc_refcounts(bs, res, refcount_table, refcount_table_size,
+ l2_offset, s->cluster_size);
+
+ /* L2 tables are cluster aligned */
+ if (l2_offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
+ "cluster aligned; L1 entry corrupted\n", l2_offset);
+ res->corruptions++;
+ }
+
+ /* Process and check L2 entries */
+ ret = check_refcounts_l2(bs, res, refcount_table,
+ refcount_table_size, l2_offset, flags);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ }
+ g_free(l1_table);
+ return 0;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+ res->check_errors++;
+ g_free(l1_table);
+ return -EIO;
+}
+
+/*
+ * Checks an image for refcount consistency.
+ *
+ * Returns 0 if no errors are found, the number of errors in case the image is
+ * detected as corrupted, and -errno when an internal error occurred.
+ */
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t size, i, highest_cluster;
+ int nb_clusters, refcount1, refcount2;
+ QCowSnapshot *sn;
+ uint16_t *refcount_table;
+ int ret;
+
+ size = bdrv_getlength(bs->file);
+ nb_clusters = size_to_clusters(s, size);
+ refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
+
+ res->bfi.total_clusters =
+ size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
+
+ /* header */
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ 0, s->cluster_size);
+
+ /* current L1 table */
+ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+ s->l1_table_offset, s->l1_size,
+ CHECK_OFLAG_COPIED | CHECK_FRAG_INFO);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* snapshots */
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+ sn->l1_table_offset, sn->l1_size, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ s->snapshots_offset, s->snapshots_size);
+
+ /* refcount data */
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ s->refcount_table_offset,
+ s->refcount_table_size * sizeof(uint64_t));
+
+ for(i = 0; i < s->refcount_table_size; i++) {
+ uint64_t offset, cluster;
+ offset = s->refcount_table[i];
+ cluster = offset >> s->cluster_bits;
+
+ /* Refcount blocks are cluster aligned */
+ if (offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
+ "cluster aligned; refcount table entry corrupted\n", i);
+ res->corruptions++;
+ continue;
+ }
+
+ if (cluster >= nb_clusters) {
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " is outside image\n", i);
+ res->corruptions++;
+ continue;
+ }
+
+ if (offset != 0) {
+ inc_refcounts(bs, res, refcount_table, nb_clusters,
+ offset, s->cluster_size);
+ if (refcount_table[cluster] != 1) {
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " refcount=%d\n",
+ i, refcount_table[cluster]);
+ res->corruptions++;
+ }
+ }
+ }
+
+ /* compare ref counts */
+ for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
+ refcount1 = get_refcount(bs, i);
+ if (refcount1 < 0) {
+ fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
+ i, strerror(-refcount1));
+ res->check_errors++;
+ continue;
+ }
+
+ refcount2 = refcount_table[i];
+
+ if (refcount1 > 0 || refcount2 > 0) {
+ highest_cluster = i;
+ }
+
+ if (refcount1 != refcount2) {
+
+ /* Check if we're allowed to fix the mismatch */
+ int *num_fixed = NULL;
+ if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
+ num_fixed = &res->leaks_fixed;
+ } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
+ num_fixed = &res->corruptions_fixed;
+ }
+
+ fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
+ num_fixed != NULL ? "Repairing" :
+ refcount1 < refcount2 ? "ERROR" :
+ "Leaked",
+ i, refcount1, refcount2);
+
+ if (num_fixed) {
+ ret = update_refcount(bs, i << s->cluster_bits, 1,
+ refcount2 - refcount1,
+ QCOW2_DISCARD_ALWAYS);
+ if (ret >= 0) {
+ (*num_fixed)++;
+ continue;
+ }
+ }
+
+ /* And if we couldn't, print an error */
+ if (refcount1 < refcount2) {
+ res->corruptions++;
+ } else {
+ res->leaks++;
+ }
+ }
+ }
+
+ res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
+ ret = 0;
+
+fail:
+ g_free(refcount_table);
+
+ return ret;
+}
+
diff --git a/contrib/qemu/block/qcow2-snapshot.c b/contrib/qemu/block/qcow2-snapshot.c
new file mode 100644
index 000000000..0caac9055
--- /dev/null
+++ b/contrib/qemu/block/qcow2-snapshot.c
@@ -0,0 +1,660 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/qcow2.h"
+
+typedef struct QEMU_PACKED QCowSnapshotHeader {
+ /* header is 8 byte aligned */
+ uint64_t l1_table_offset;
+
+ uint32_t l1_size;
+ uint16_t id_str_size;
+ uint16_t name_size;
+
+ uint32_t date_sec;
+ uint32_t date_nsec;
+
+ uint64_t vm_clock_nsec;
+
+ uint32_t vm_state_size;
+ uint32_t extra_data_size; /* for extension */
+ /* extra data follows */
+ /* id_str follows */
+ /* name follows */
+} QCowSnapshotHeader;
+
+typedef struct QEMU_PACKED QCowSnapshotExtraData {
+ uint64_t vm_state_size_large;
+ uint64_t disk_size;
+} QCowSnapshotExtraData;
+
+void qcow2_free_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ g_free(s->snapshots[i].name);
+ g_free(s->snapshots[i].id_str);
+ }
+ g_free(s->snapshots);
+ s->snapshots = NULL;
+ s->nb_snapshots = 0;
+}
+
+int qcow2_read_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshotHeader h;
+ QCowSnapshotExtraData extra;
+ QCowSnapshot *sn;
+ int i, id_str_size, name_size;
+ int64_t offset;
+ uint32_t extra_data_size;
+ int ret;
+
+ if (!s->nb_snapshots) {
+ s->snapshots = NULL;
+ s->snapshots_size = 0;
+ return 0;
+ }
+
+ offset = s->snapshots_offset;
+ s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot));
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ /* Read statically sized part of the snapshot header */
+ offset = align_offset(offset, 8);
+ ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ offset += sizeof(h);
+ sn = s->snapshots + i;
+ sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
+ sn->l1_size = be32_to_cpu(h.l1_size);
+ sn->vm_state_size = be32_to_cpu(h.vm_state_size);
+ sn->date_sec = be32_to_cpu(h.date_sec);
+ sn->date_nsec = be32_to_cpu(h.date_nsec);
+ sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
+ extra_data_size = be32_to_cpu(h.extra_data_size);
+
+ id_str_size = be16_to_cpu(h.id_str_size);
+ name_size = be16_to_cpu(h.name_size);
+
+ /* Read extra data */
+ ret = bdrv_pread(bs->file, offset, &extra,
+ MIN(sizeof(extra), extra_data_size));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += extra_data_size;
+
+ if (extra_data_size >= 8) {
+ sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large);
+ }
+
+ if (extra_data_size >= 16) {
+ sn->disk_size = be64_to_cpu(extra.disk_size);
+ } else {
+ sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ }
+
+ /* Read snapshot ID */
+ sn->id_str = g_malloc(id_str_size + 1);
+ ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += id_str_size;
+ sn->id_str[id_str_size] = '\0';
+
+ /* Read snapshot name */
+ sn->name = g_malloc(name_size + 1);
+ ret = bdrv_pread(bs->file, offset, sn->name, name_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += name_size;
+ sn->name[name_size] = '\0';
+ }
+
+ s->snapshots_size = offset - s->snapshots_offset;
+ return 0;
+
+fail:
+ qcow2_free_snapshots(bs);
+ return ret;
+}
+
+/* add at the end of the file a new list of snapshots */
+static int qcow2_write_snapshots(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ QCowSnapshotHeader h;
+ QCowSnapshotExtraData extra;
+ int i, name_size, id_str_size, snapshots_size;
+ struct {
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+ } QEMU_PACKED header_data;
+ int64_t offset, snapshots_offset;
+ int ret;
+
+ /* compute the size of the snapshots */
+ offset = 0;
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ offset = align_offset(offset, 8);
+ offset += sizeof(h);
+ offset += sizeof(extra);
+ offset += strlen(sn->id_str);
+ offset += strlen(sn->name);
+ }
+ snapshots_size = offset;
+
+ /* Allocate space for the new snapshot list */
+ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
+ offset = snapshots_offset;
+ if (offset < 0) {
+ return offset;
+ }
+ ret = bdrv_flush(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Write all snapshots to the new list */
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ memset(&h, 0, sizeof(h));
+ h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
+ h.l1_size = cpu_to_be32(sn->l1_size);
+ /* If it doesn't fit in 32 bit, older implementations should treat it
+ * as a disk-only snapshot rather than truncate the VM state */
+ if (sn->vm_state_size <= 0xffffffff) {
+ h.vm_state_size = cpu_to_be32(sn->vm_state_size);
+ }
+ h.date_sec = cpu_to_be32(sn->date_sec);
+ h.date_nsec = cpu_to_be32(sn->date_nsec);
+ h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
+ h.extra_data_size = cpu_to_be32(sizeof(extra));
+
+ memset(&extra, 0, sizeof(extra));
+ extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size);
+ extra.disk_size = cpu_to_be64(sn->disk_size);
+
+ id_str_size = strlen(sn->id_str);
+ name_size = strlen(sn->name);
+ h.id_str_size = cpu_to_be16(id_str_size);
+ h.name_size = cpu_to_be16(name_size);
+ offset = align_offset(offset, 8);
+
+ ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += sizeof(h);
+
+ ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra));
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += sizeof(extra);
+
+ ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += id_str_size;
+
+ ret = bdrv_pwrite(bs->file, offset, sn->name, name_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += name_size;
+ }
+
+ /*
+ * Update the header to point to the new snapshot table. This requires the
+ * new table and its refcounts to be stable on disk.
+ */
+ ret = bdrv_flush(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
+ offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+
+ header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots);
+ header_data.snapshots_offset = cpu_to_be64(snapshots_offset);
+
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+ &header_data, sizeof(header_data));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* free the old snapshot table */
+ qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size,
+ QCOW2_DISCARD_SNAPSHOT);
+ s->snapshots_offset = snapshots_offset;
+ s->snapshots_size = snapshots_size;
+ return 0;
+
+fail:
+ return ret;
+}
+
+static void find_new_snapshot_id(BlockDriverState *bs,
+ char *id_str, int id_str_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ int i, id, id_max = 0;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ id = strtoul(sn->id_str, NULL, 10);
+ if (id > id_max)
+ id_max = id;
+ }
+ snprintf(id_str, id_str_size, "%d", id_max + 1);
+}
+
+static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i;
+
+ for(i = 0; i < s->nb_snapshots; i++) {
+ if (!strcmp(s->snapshots[i].id_str, id_str))
+ return i;
+ }
+ return -1;
+}
+
+static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
+{
+ BDRVQcowState *s = bs->opaque;
+ int i, ret;
+
+ ret = find_snapshot_by_id(bs, name);
+ if (ret >= 0)
+ return ret;
+ for(i = 0; i < s->nb_snapshots; i++) {
+ if (!strcmp(s->snapshots[i].name, name))
+ return i;
+ }
+ return -1;
+}
+
+/* if no id is provided, a new one is constructed */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *new_snapshot_list = NULL;
+ QCowSnapshot *old_snapshot_list = NULL;
+ QCowSnapshot sn1, *sn = &sn1;
+ int i, ret;
+ uint64_t *l1_table = NULL;
+ int64_t l1_table_offset;
+
+ memset(sn, 0, sizeof(*sn));
+
+ /* Generate an ID if it wasn't passed */
+ if (sn_info->id_str[0] == '\0') {
+ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
+ }
+
+ /* Check that the ID is unique */
+ if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) {
+ return -EEXIST;
+ }
+
+ /* Populate sn with passed data */
+ sn->id_str = g_strdup(sn_info->id_str);
+ sn->name = g_strdup(sn_info->name);
+
+ sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ sn->vm_state_size = sn_info->vm_state_size;
+ sn->date_sec = sn_info->date_sec;
+ sn->date_nsec = sn_info->date_nsec;
+ sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+
+ /* Allocate the L1 table of the snapshot and copy the current one there. */
+ l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
+ if (l1_table_offset < 0) {
+ ret = l1_table_offset;
+ goto fail;
+ }
+
+ sn->l1_table_offset = l1_table_offset;
+ sn->l1_size = s->l1_size;
+
+ l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
+ for(i = 0; i < s->l1_size; i++) {
+ l1_table[i] = cpu_to_be64(s->l1_table[i]);
+ }
+
+ ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+
+ g_free(l1_table);
+ l1_table = NULL;
+
+ /*
+ * Increase the refcounts of all clusters and make sure everything is
+ * stable on disk before updating the snapshot table to contain a pointer
+ * to the new L1 table.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Append the new snapshot to the snapshot list */
+ new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+ if (s->snapshots) {
+ memcpy(new_snapshot_list, s->snapshots,
+ s->nb_snapshots * sizeof(QCowSnapshot));
+ old_snapshot_list = s->snapshots;
+ }
+ s->snapshots = new_snapshot_list;
+ s->snapshots[s->nb_snapshots++] = *sn;
+
+ ret = qcow2_write_snapshots(bs);
+ if (ret < 0) {
+ g_free(s->snapshots);
+ s->snapshots = old_snapshot_list;
+ goto fail;
+ }
+
+ g_free(old_snapshot_list);
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+
+fail:
+ g_free(sn->id_str);
+ g_free(sn->name);
+ g_free(l1_table);
+
+ return ret;
+}
+
+/* copy the snapshot 'snapshot_name' into the current disk image */
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ int i, snapshot_index;
+ int cur_l1_bytes, sn_l1_bytes;
+ int ret;
+ uint64_t *sn_l1_table = NULL;
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = &s->snapshots[snapshot_index];
+
+ if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) {
+ error_report("qcow2: Loading snapshots with different disk "
+ "size is not implemented");
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /*
+ * Make sure that the current L1 table is big enough to contain the whole
+ * L1 table of the snapshot. If the snapshot L1 table is smaller, the
+ * current one must be padded with zeros.
+ */
+ ret = qcow2_grow_l1_table(bs, sn->l1_size, true);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ cur_l1_bytes = s->l1_size * sizeof(uint64_t);
+ sn_l1_bytes = sn->l1_size * sizeof(uint64_t);
+
+ /*
+ * Copy the snapshot L1 table to the current L1 table.
+ *
+ * Before overwriting the old current L1 table on disk, make sure to
+ * increase all refcounts for the clusters referenced by the new one.
+ * Decrease the refcounts of clusters referenced by the old one only when
+ * the L1 table is overwritten.
+ */
+ sn_l1_table = g_malloc0(cur_l1_bytes);
+
+ ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset,
+ sn->l1_size, 1);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table,
+ cur_l1_bytes);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /*
+ * Decrease refcount of clusters of current L1 table.
+ *
+ * At this point, the in-memory s->l1_table points to the old L1 table,
+ * whereas on disk we already have the new one.
+ *
+ * qcow2_update_snapshot_refcount special cases the current L1 table to use
+ * the in-memory data instead of really using the offset to load a new one,
+ * which is why this works.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+ s->l1_size, -1);
+
+ /*
+ * Now update the in-memory L1 table to be in sync with the on-disk one. We
+ * need to do this even if updating refcounts failed.
+ */
+ for(i = 0;i < s->l1_size; i++) {
+ s->l1_table[i] = be64_to_cpu(sn_l1_table[i]);
+ }
+
+ if (ret < 0) {
+ goto fail;
+ }
+
+ g_free(sn_l1_table);
+ sn_l1_table = NULL;
+
+ /*
+ * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed
+ * when we decreased the refcount of the old snapshot).
+ */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+
+fail:
+ g_free(sn_l1_table);
+ return ret;
+}
+
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot sn;
+ int snapshot_index, ret;
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = s->snapshots[snapshot_index];
+
+ /* Remove it from the snapshot list */
+ memmove(s->snapshots + snapshot_index,
+ s->snapshots + snapshot_index + 1,
+ (s->nb_snapshots - snapshot_index - 1) * sizeof(sn));
+ s->nb_snapshots--;
+ ret = qcow2_write_snapshots(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * The snapshot is now unused, clean up. If we fail after this point, we
+ * won't recover but just leak clusters.
+ */
+ g_free(sn.id_str);
+ g_free(sn.name);
+
+ /*
+ * Now decrease the refcounts of clusters referenced by the snapshot and
+ * free the L1 table.
+ */
+ ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset,
+ sn.l1_size, -1);
+ if (ret < 0) {
+ return ret;
+ }
+ qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t),
+ QCOW2_DISCARD_SNAPSHOT);
+
+ /* must update the copied flag on the current cluster offsets */
+ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return 0;
+}
+
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
+{
+ BDRVQcowState *s = bs->opaque;
+ QEMUSnapshotInfo *sn_tab, *sn_info;
+ QCowSnapshot *sn;
+ int i;
+
+ if (!s->nb_snapshots) {
+ *psn_tab = NULL;
+ return s->nb_snapshots;
+ }
+
+ sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn_info = sn_tab + i;
+ sn = s->snapshots + i;
+ pstrcpy(sn_info->id_str, sizeof(sn_info->id_str),
+ sn->id_str);
+ pstrcpy(sn_info->name, sizeof(sn_info->name),
+ sn->name);
+ sn_info->vm_state_size = sn->vm_state_size;
+ sn_info->date_sec = sn->date_sec;
+ sn_info->date_nsec = sn->date_nsec;
+ sn_info->vm_clock_nsec = sn->vm_clock_nsec;
+ }
+ *psn_tab = sn_tab;
+ return s->nb_snapshots;
+}
+
+int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
+{
+ int i, snapshot_index;
+ BDRVQcowState *s = bs->opaque;
+ QCowSnapshot *sn;
+ uint64_t *new_l1_table;
+ int new_l1_bytes;
+ int ret;
+
+ assert(bs->read_only);
+
+ /* Search the snapshot */
+ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name);
+ if (snapshot_index < 0) {
+ return -ENOENT;
+ }
+ sn = &s->snapshots[snapshot_index];
+
+ /* Allocate and read in the snapshot's L1 table */
+ new_l1_bytes = s->l1_size * sizeof(uint64_t);
+ new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
+
+ ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
+ if (ret < 0) {
+ g_free(new_l1_table);
+ return ret;
+ }
+
+ /* Switch the L1 table */
+ g_free(s->l1_table);
+
+ s->l1_size = sn->l1_size;
+ s->l1_table_offset = sn->l1_table_offset;
+ s->l1_table = new_l1_table;
+
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+
+ return 0;
+}
diff --git a/contrib/qemu/block/qcow2.c b/contrib/qemu/block/qcow2.c
new file mode 100644
index 000000000..0eceefe2c
--- /dev/null
+++ b/contrib/qemu/block/qcow2.c
@@ -0,0 +1,1825 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include <zlib.h>
+#include "qemu/aes.h"
+#include "block/qcow2.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qbool.h"
+#include "trace.h"
+
+/*
+ Differences with QCOW:
+
+ - Support for multiple incremental snapshots.
+ - Memory management by reference counts.
+ - Clusters which have a reference count of one have the bit
+ QCOW_OFLAG_COPIED to optimize write performance.
+ - Size of compressed clusters is stored in sectors to reduce bit usage
+ in the cluster offsets.
+ - Support for storing additional data (such as the VM state) in the
+ snapshots.
+ - If a backing store is used, the cluster size is not constrained
+ (could be backported to QCOW).
+ - L2 tables always have a size of one cluster.
+*/
+
+
+typedef struct {
+ uint32_t magic;
+ uint32_t len;
+} QCowExtension;
+
+#define QCOW2_EXT_MAGIC_END 0
+#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
+#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
+
+static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+ const QCowHeader *cow_header = (const void *)buf;
+
+ if (buf_size >= sizeof(QCowHeader) &&
+ be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
+ be32_to_cpu(cow_header->version) >= 2)
+ return 100;
+ else
+ return 0;
+}
+
+
+/*
+ * read qcow2 extension and fill bs
+ * start reading from start_offset
+ * finish reading upon magic of value 0 or when end_offset reached
+ * unknown magic is skipped (future extensions this version knows nothing about)
+ * return 0 upon success, non-0 otherwise
+ */
+static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
+ uint64_t end_offset, void **p_feature_table)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowExtension ext;
+ uint64_t offset;
+ int ret;
+
+#ifdef DEBUG_EXT
+ printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
+#endif
+ offset = start_offset;
+ while (offset < end_offset) {
+
+#ifdef DEBUG_EXT
+ /* Sanity check */
+ if (offset > s->cluster_size)
+ printf("qcow2_read_extension: suspicious offset %lu\n", offset);
+
+ printf("attempting to read extended header in offset %lu\n", offset);
+#endif
+
+ if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
+ fprintf(stderr, "qcow2_read_extension: ERROR: "
+ "pread fail from offset %" PRIu64 "\n",
+ offset);
+ return 1;
+ }
+ be32_to_cpus(&ext.magic);
+ be32_to_cpus(&ext.len);
+ offset += sizeof(ext);
+#ifdef DEBUG_EXT
+ printf("ext.magic = 0x%x\n", ext.magic);
+#endif
+ if (ext.len > end_offset - offset) {
+ error_report("Header extension too large");
+ return -EINVAL;
+ }
+
+ switch (ext.magic) {
+ case QCOW2_EXT_MAGIC_END:
+ return 0;
+
+ case QCOW2_EXT_MAGIC_BACKING_FORMAT:
+ if (ext.len >= sizeof(bs->backing_format)) {
+ fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
+ " (>=%zu)\n",
+ ext.len, sizeof(bs->backing_format));
+ return 2;
+ }
+ if (bdrv_pread(bs->file, offset , bs->backing_format,
+ ext.len) != ext.len)
+ return 3;
+ bs->backing_format[ext.len] = '\0';
+#ifdef DEBUG_EXT
+ printf("Qcow2: Got format extension %s\n", bs->backing_format);
+#endif
+ break;
+
+ case QCOW2_EXT_MAGIC_FEATURE_TABLE:
+ if (p_feature_table != NULL) {
+ void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
+ ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *p_feature_table = feature_table;
+ }
+ break;
+
+ default:
+ /* unknown magic - save it in case we need to rewrite the header */
+ {
+ Qcow2UnknownHeaderExtension *uext;
+
+ uext = g_malloc0(sizeof(*uext) + ext.len);
+ uext->magic = ext.magic;
+ uext->len = ext.len;
+ QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
+
+ ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ break;
+ }
+
+ offset += ((ext.len + 7) & ~7);
+ }
+
+ return 0;
+}
+
+static void cleanup_unknown_header_ext(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ Qcow2UnknownHeaderExtension *uext, *next;
+
+ QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
+ QLIST_REMOVE(uext, next);
+ g_free(uext);
+ }
+}
+
+static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
+ const char *fmt, ...)
+{
+ char msg[64];
+ va_list ap;
+
+ va_start(ap, fmt);
+ vsnprintf(msg, sizeof(msg), fmt, ap);
+ va_end(ap);
+
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "qcow2", msg);
+}
+
+static void report_unsupported_feature(BlockDriverState *bs,
+ Qcow2Feature *table, uint64_t mask)
+{
+ while (table && table->name[0] != '\0') {
+ if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
+ if (mask & (1 << table->bit)) {
+ report_unsupported(bs, "%.46s",table->name);
+ mask &= ~(1 << table->bit);
+ }
+ }
+ table++;
+ }
+
+ if (mask) {
+ report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
+ }
+}
+
+/*
+ * Sets the dirty bit and flushes afterwards if necessary.
+ *
+ * The incompatible_features bit is only set if the image file header was
+ * updated successfully. Therefore it is not required to check the return
+ * value of this function.
+ */
+int qcow2_mark_dirty(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t val;
+ int ret;
+
+ assert(s->qcow_version >= 3);
+
+ if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
+ return 0; /* already dirty */
+ }
+
+ val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
+ ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
+ &val, sizeof(val));
+ if (ret < 0) {
+ return ret;
+ }
+ ret = bdrv_flush(bs->file);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Only treat image as dirty if the header was updated successfully */
+ s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
+ return 0;
+}
+
+/*
+ * Clears the dirty bit and flushes before if necessary. Only call this
+ * function when there are no pending requests; it does not guard against
+ * concurrent requests dirtying the image.
+ */
+static int qcow2_mark_clean(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
+ int ret = bdrv_flush(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
+ return qcow2_update_header(bs);
+ }
+ return 0;
+}
+
+static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
+{
+ int ret = qcow2_check_refcounts(bs, result, fix);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (fix && result->check_errors == 0 && result->corruptions == 0) {
+ return qcow2_mark_clean(bs);
+ }
+ return ret;
+}
+
+static QemuOptsList qcow2_runtime_opts = {
+ .name = "qcow2",
+ .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
+ .desc = {
+ {
+ .name = "lazy_refcounts",
+ .type = QEMU_OPT_BOOL,
+ .help = "Postpone refcount updates",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_REQUEST,
+ .type = QEMU_OPT_BOOL,
+ .help = "Pass guest discard requests to the layer below",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_SNAPSHOT,
+ .type = QEMU_OPT_BOOL,
+ .help = "Generate discard requests when snapshot related space "
+ "is freed",
+ },
+ {
+ .name = QCOW2_OPT_DISCARD_OTHER,
+ .type = QEMU_OPT_BOOL,
+ .help = "Generate discard requests when other clusters are freed",
+ },
+ { /* end of list */ }
+ },
+};
+
+static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVQcowState *s = bs->opaque;
+ int len, i, ret = 0;
+ QCowHeader header;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ uint64_t ext_end;
+ uint64_t l1_vm_state_index;
+
+ ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto fail;
+ }
+ be32_to_cpus(&header.magic);
+ be32_to_cpus(&header.version);
+ be64_to_cpus(&header.backing_file_offset);
+ be32_to_cpus(&header.backing_file_size);
+ be64_to_cpus(&header.size);
+ be32_to_cpus(&header.cluster_bits);
+ be32_to_cpus(&header.crypt_method);
+ be64_to_cpus(&header.l1_table_offset);
+ be32_to_cpus(&header.l1_size);
+ be64_to_cpus(&header.refcount_table_offset);
+ be32_to_cpus(&header.refcount_table_clusters);
+ be64_to_cpus(&header.snapshots_offset);
+ be32_to_cpus(&header.nb_snapshots);
+
+ if (header.magic != QCOW_MAGIC) {
+ ret = -EMEDIUMTYPE;
+ goto fail;
+ }
+ if (header.version < 2 || header.version > 3) {
+ report_unsupported(bs, "QCOW version %d", header.version);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ s->qcow_version = header.version;
+
+ /* Initialise version 3 header fields */
+ if (header.version == 2) {
+ header.incompatible_features = 0;
+ header.compatible_features = 0;
+ header.autoclear_features = 0;
+ header.refcount_order = 4;
+ header.header_length = 72;
+ } else {
+ be64_to_cpus(&header.incompatible_features);
+ be64_to_cpus(&header.compatible_features);
+ be64_to_cpus(&header.autoclear_features);
+ be32_to_cpus(&header.refcount_order);
+ be32_to_cpus(&header.header_length);
+ }
+
+ if (header.header_length > sizeof(header)) {
+ s->unknown_header_fields_size = header.header_length - sizeof(header);
+ s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
+ ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
+ s->unknown_header_fields_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ if (header.backing_file_offset) {
+ ext_end = header.backing_file_offset;
+ } else {
+ ext_end = 1 << header.cluster_bits;
+ }
+
+ /* Handle feature bits */
+ s->incompatible_features = header.incompatible_features;
+ s->compatible_features = header.compatible_features;
+ s->autoclear_features = header.autoclear_features;
+
+ if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
+ void *feature_table = NULL;
+ qcow2_read_extensions(bs, header.header_length, ext_end,
+ &feature_table);
+ report_unsupported_feature(bs, feature_table,
+ s->incompatible_features &
+ ~QCOW2_INCOMPAT_MASK);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ /* Check support for various header values */
+ if (header.refcount_order != 4) {
+ report_unsupported(bs, "%d bit reference counts",
+ 1 << header.refcount_order);
+ ret = -ENOTSUP;
+ goto fail;
+ }
+
+ if (header.cluster_bits < MIN_CLUSTER_BITS ||
+ header.cluster_bits > MAX_CLUSTER_BITS) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ if (header.crypt_method > QCOW_CRYPT_AES) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->crypt_method_header = header.crypt_method;
+ if (s->crypt_method_header) {
+ bs->encrypted = 1;
+ }
+ s->cluster_bits = header.cluster_bits;
+ s->cluster_size = 1 << s->cluster_bits;
+ s->cluster_sectors = 1 << (s->cluster_bits - 9);
+ s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
+ s->l2_size = 1 << s->l2_bits;
+ bs->total_sectors = header.size / 512;
+ s->csize_shift = (62 - (s->cluster_bits - 8));
+ s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
+ s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
+ s->refcount_table_offset = header.refcount_table_offset;
+ s->refcount_table_size =
+ header.refcount_table_clusters << (s->cluster_bits - 3);
+
+ s->snapshots_offset = header.snapshots_offset;
+ s->nb_snapshots = header.nb_snapshots;
+
+ /* read the level 1 table */
+ s->l1_size = header.l1_size;
+
+ l1_vm_state_index = size_to_l1(s, header.size);
+ if (l1_vm_state_index > INT_MAX) {
+ ret = -EFBIG;
+ goto fail;
+ }
+ s->l1_vm_state_index = l1_vm_state_index;
+
+ /* the L1 table must contain at least enough entries to cover
+ header.size bytes */
+ if (s->l1_size < s->l1_vm_state_index) {
+ ret = -EINVAL;
+ goto fail;
+ }
+ s->l1_table_offset = header.l1_table_offset;
+ if (s->l1_size > 0) {
+ s->l1_table = g_malloc0(
+ align_offset(s->l1_size * sizeof(uint64_t), 512));
+ ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
+ s->l1_size * sizeof(uint64_t));
+ if (ret < 0) {
+ goto fail;
+ }
+ for(i = 0;i < s->l1_size; i++) {
+ be64_to_cpus(&s->l1_table[i]);
+ }
+ }
+
+ /* alloc L2 table/refcount block cache */
+ s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
+ s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
+
+ s->cluster_cache = g_malloc(s->cluster_size);
+ /* one more sector for decompressed data alignment */
+ s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+ + 512);
+ s->cluster_cache_offset = -1;
+ s->flags = flags;
+
+ ret = qcow2_refcount_init(bs);
+ if (ret != 0) {
+ goto fail;
+ }
+
+ QLIST_INIT(&s->cluster_allocs);
+ QTAILQ_INIT(&s->discards);
+
+ /* read qcow2 extensions */
+ if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* read the backing file name */
+ if (header.backing_file_offset != 0) {
+ len = header.backing_file_size;
+ if (len > 1023) {
+ len = 1023;
+ }
+ ret = bdrv_pread(bs->file, header.backing_file_offset,
+ bs->backing_file, len);
+ if (ret < 0) {
+ goto fail;
+ }
+ bs->backing_file[len] = '\0';
+ }
+
+ ret = qcow2_read_snapshots(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Clear unknown autoclear feature bits */
+ if (!bs->read_only && s->autoclear_features != 0) {
+ s->autoclear_features = 0;
+ ret = qcow2_update_header(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* Initialise locks */
+ qemu_co_mutex_init(&s->lock);
+
+ /* Repair image if dirty */
+ if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
+ (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
+ BdrvCheckResult result = {0};
+
+ ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ /* Enable lazy_refcounts according to image and command line options */
+ opts = qemu_opts_create_nofail(&qcow2_runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
+ (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
+
+ s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
+ s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
+ s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
+ flags & BDRV_O_UNMAP);
+ s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
+ s->discard_passthrough[QCOW2_DISCARD_OTHER] =
+ qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
+
+ qemu_opts_del(opts);
+
+ if (s->use_lazy_refcounts && s->qcow_version < 3) {
+ qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require "
+ "a qcow2 image with at least qemu 1.1 compatibility level");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+#ifdef DEBUG_ALLOC
+ {
+ BdrvCheckResult result = {0};
+ qcow2_check_refcounts(bs, &result, 0);
+ }
+#endif
+ return ret;
+
+ fail:
+ g_free(s->unknown_header_fields);
+ cleanup_unknown_header_ext(bs);
+ qcow2_free_snapshots(bs);
+ qcow2_refcount_close(bs);
+ g_free(s->l1_table);
+ if (s->l2_table_cache) {
+ qcow2_cache_destroy(bs, s->l2_table_cache);
+ }
+ g_free(s->cluster_cache);
+ qemu_vfree(s->cluster_data);
+ return ret;
+}
+
+static int qcow2_set_key(BlockDriverState *bs, const char *key)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint8_t keybuf[16];
+ int len, i;
+
+ memset(keybuf, 0, 16);
+ len = strlen(key);
+ if (len > 16)
+ len = 16;
+ /* XXX: we could compress the chars to 7 bits to increase
+ entropy */
+ for(i = 0;i < len;i++) {
+ keybuf[i] = key[i];
+ }
+ s->crypt_method = s->crypt_method_header;
+
+ if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+ return -1;
+ if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ return -1;
+#if 0
+ /* test */
+ {
+ uint8_t in[16];
+ uint8_t out[16];
+ uint8_t tmp[16];
+ for(i=0;i<16;i++)
+ in[i] = i;
+ AES_encrypt(in, tmp, &s->aes_encrypt_key);
+ AES_decrypt(tmp, out, &s->aes_decrypt_key);
+ for(i = 0; i < 16; i++)
+ printf(" %02x", tmp[i]);
+ printf("\n");
+ for(i = 0; i < 16; i++)
+ printf(" %02x", out[i]);
+ printf("\n");
+ }
+#endif
+ return 0;
+}
+
+/* We have nothing to do for QCOW2 reopen; stubs just return
+ * success */
+static int qcow2_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
+static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t cluster_offset;
+ int ret;
+
+ *pnum = nb_sectors;
+ /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
+ * can't pass them on today */
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
+ qemu_co_mutex_unlock(&s->lock);
+ if (ret < 0) {
+ *pnum = 0;
+ }
+
+ return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
+}
+
+/* handle reading after the end of the backing file */
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors)
+{
+ int n1;
+ if ((sector_num + nb_sectors) <= bs->total_sectors)
+ return nb_sectors;
+ if (sector_num >= bs->total_sectors)
+ n1 = 0;
+ else
+ n1 = bs->total_sectors - sector_num;
+
+ qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
+
+ return n1;
+}
+
+static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int remaining_sectors, QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster, n1;
+ int ret;
+ int cur_nr_sectors; /* number of sectors in current iteration */
+ uint64_t cluster_offset = 0;
+ uint64_t bytes_done = 0;
+ QEMUIOVector hd_qiov;
+ uint8_t *cluster_data = NULL;
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (remaining_sectors != 0) {
+
+ /* prepare next request */
+ cur_nr_sectors = remaining_sectors;
+ if (s->crypt_method) {
+ cur_nr_sectors = MIN(cur_nr_sectors,
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+ }
+
+ ret = qcow2_get_cluster_offset(bs, sector_num << 9,
+ &cur_nr_sectors, &cluster_offset);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+ cur_nr_sectors * 512);
+
+ switch (ret) {
+ case QCOW2_CLUSTER_UNALLOCATED:
+
+ if (bs->backing_hd) {
+ /* read from the base image */
+ n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
+ sector_num, cur_nr_sectors);
+ if (n1 > 0) {
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->backing_hd, sector_num,
+ n1, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+ } else {
+ /* Note: in this case, no need to wait */
+ qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+ }
+ break;
+
+ case QCOW2_CLUSTER_ZERO:
+ qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+ break;
+
+ case QCOW2_CLUSTER_COMPRESSED:
+ /* add AIO support for compressed blocks ? */
+ ret = qcow2_decompress_cluster(bs, cluster_offset);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ qemu_iovec_from_buf(&hd_qiov, 0,
+ s->cluster_cache + index_in_cluster * 512,
+ 512 * cur_nr_sectors);
+ break;
+
+ case QCOW2_CLUSTER_NORMAL:
+ if ((cluster_offset & 511) != 0) {
+ ret = -EIO;
+ goto fail;
+ }
+
+ if (s->crypt_method) {
+ /*
+ * For encrypted images, read everything into a temporary
+ * contiguous buffer on which the AES functions can work.
+ */
+ if (!cluster_data) {
+ cluster_data =
+ qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+ }
+
+ assert(cur_nr_sectors <=
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_add(&hd_qiov, cluster_data,
+ 512 * cur_nr_sectors);
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ qemu_co_mutex_unlock(&s->lock);
+ ret = bdrv_co_readv(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ cur_nr_sectors, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+ if (s->crypt_method) {
+ qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
+ qemu_iovec_from_buf(qiov, bytes_done,
+ cluster_data, 512 * cur_nr_sectors);
+ }
+ break;
+
+ default:
+ g_assert_not_reached();
+ ret = -EIO;
+ goto fail;
+ }
+
+ remaining_sectors -= cur_nr_sectors;
+ sector_num += cur_nr_sectors;
+ bytes_done += cur_nr_sectors * 512;
+ }
+ ret = 0;
+
+fail:
+ qemu_co_mutex_unlock(&s->lock);
+
+ qemu_iovec_destroy(&hd_qiov);
+ qemu_vfree(cluster_data);
+
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ int remaining_sectors,
+ QEMUIOVector *qiov)
+{
+ BDRVQcowState *s = bs->opaque;
+ int index_in_cluster;
+ int n_end;
+ int ret;
+ int cur_nr_sectors; /* number of sectors in current iteration */
+ uint64_t cluster_offset;
+ QEMUIOVector hd_qiov;
+ uint64_t bytes_done = 0;
+ uint8_t *cluster_data = NULL;
+ QCowL2Meta *l2meta = NULL;
+
+ trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
+ remaining_sectors);
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ s->cluster_cache_offset = -1; /* disable compressed cache */
+
+ qemu_co_mutex_lock(&s->lock);
+
+ while (remaining_sectors != 0) {
+
+ l2meta = NULL;
+
+ trace_qcow2_writev_start_part(qemu_coroutine_self());
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n_end = index_in_cluster + remaining_sectors;
+ if (s->crypt_method &&
+ n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
+ n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+ }
+
+ ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
+ index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ assert((cluster_offset & 511) == 0);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
+ cur_nr_sectors * 512);
+
+ if (s->crypt_method) {
+ if (!cluster_data) {
+ cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
+ s->cluster_size);
+ }
+
+ assert(hd_qiov.size <=
+ QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+ qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
+
+ qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_add(&hd_qiov, cluster_data,
+ cur_nr_sectors * 512);
+ }
+
+ qemu_co_mutex_unlock(&s->lock);
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+ trace_qcow2_writev_data(qemu_coroutine_self(),
+ (cluster_offset >> 9) + index_in_cluster);
+ ret = bdrv_co_writev(bs->file,
+ (cluster_offset >> 9) + index_in_cluster,
+ cur_nr_sectors, &hd_qiov);
+ qemu_co_mutex_lock(&s->lock);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ while (l2meta != NULL) {
+ QCowL2Meta *next;
+
+ ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Take the request off the list of running requests */
+ if (l2meta->nb_clusters != 0) {
+ QLIST_REMOVE(l2meta, next_in_flight);
+ }
+
+ qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+ next = l2meta->next;
+ g_free(l2meta);
+ l2meta = next;
+ }
+
+ remaining_sectors -= cur_nr_sectors;
+ sector_num += cur_nr_sectors;
+ bytes_done += cur_nr_sectors * 512;
+ trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
+ }
+ ret = 0;
+
+fail:
+ qemu_co_mutex_unlock(&s->lock);
+
+ while (l2meta != NULL) {
+ QCowL2Meta *next;
+
+ if (l2meta->nb_clusters != 0) {
+ QLIST_REMOVE(l2meta, next_in_flight);
+ }
+ qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+ next = l2meta->next;
+ g_free(l2meta);
+ l2meta = next;
+ }
+
+ qemu_iovec_destroy(&hd_qiov);
+ qemu_vfree(cluster_data);
+ trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
+
+ return ret;
+}
+
+static void qcow2_close(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ g_free(s->l1_table);
+
+ qcow2_cache_flush(bs, s->l2_table_cache);
+ qcow2_cache_flush(bs, s->refcount_block_cache);
+
+ qcow2_mark_clean(bs);
+
+ qcow2_cache_destroy(bs, s->l2_table_cache);
+ qcow2_cache_destroy(bs, s->refcount_block_cache);
+
+ g_free(s->unknown_header_fields);
+ cleanup_unknown_header_ext(bs);
+
+ g_free(s->cluster_cache);
+ qemu_vfree(s->cluster_data);
+ qcow2_refcount_close(bs);
+ qcow2_free_snapshots(bs);
+}
+
+static void qcow2_invalidate_cache(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int flags = s->flags;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ uint32_t crypt_method = 0;
+ QDict *options;
+
+ /*
+ * Backing files are read-only, which makes all of their metadata immutable;
+ * that means we don't have to worry about reopening them here.
+ */
+
+ if (s->crypt_method) {
+ crypt_method = s->crypt_method;
+ memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
+ memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
+ }
+
+ qcow2_close(bs);
+
+ options = qdict_new();
+ qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
+ qbool_from_int(s->use_lazy_refcounts));
+
+ memset(s, 0, sizeof(BDRVQcowState));
+ qcow2_open(bs, options, flags);
+
+ QDECREF(options);
+
+ if (crypt_method) {
+ s->crypt_method = crypt_method;
+ memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
+ memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
+ }
+}
+
+static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
+ size_t len, size_t buflen)
+{
+ QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
+ size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
+
+ if (buflen < ext_len) {
+ return -ENOSPC;
+ }
+
+ *ext_backing_fmt = (QCowExtension) {
+ .magic = cpu_to_be32(magic),
+ .len = cpu_to_be32(len),
+ };
+ memcpy(buf + sizeof(QCowExtension), s, len);
+
+ return ext_len;
+}
+
+/*
+ * Updates the qcow2 header, including the variable length parts of it, i.e.
+ * the backing file name and all extensions. qcow2 was not designed to allow
+ * such changes, so if we run out of space (we can only use the first cluster)
+ * this function may fail.
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int qcow2_update_header(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowHeader *header;
+ char *buf;
+ size_t buflen = s->cluster_size;
+ int ret;
+ uint64_t total_size;
+ uint32_t refcount_table_clusters;
+ size_t header_length;
+ Qcow2UnknownHeaderExtension *uext;
+
+ buf = qemu_blockalign(bs, buflen);
+
+ /* Header structure */
+ header = (QCowHeader*) buf;
+
+ if (buflen < sizeof(*header)) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ header_length = sizeof(*header) + s->unknown_header_fields_size;
+ total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+ refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
+
+ *header = (QCowHeader) {
+ /* Version 2 fields */
+ .magic = cpu_to_be32(QCOW_MAGIC),
+ .version = cpu_to_be32(s->qcow_version),
+ .backing_file_offset = 0,
+ .backing_file_size = 0,
+ .cluster_bits = cpu_to_be32(s->cluster_bits),
+ .size = cpu_to_be64(total_size),
+ .crypt_method = cpu_to_be32(s->crypt_method_header),
+ .l1_size = cpu_to_be32(s->l1_size),
+ .l1_table_offset = cpu_to_be64(s->l1_table_offset),
+ .refcount_table_offset = cpu_to_be64(s->refcount_table_offset),
+ .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
+ .nb_snapshots = cpu_to_be32(s->nb_snapshots),
+ .snapshots_offset = cpu_to_be64(s->snapshots_offset),
+
+ /* Version 3 fields */
+ .incompatible_features = cpu_to_be64(s->incompatible_features),
+ .compatible_features = cpu_to_be64(s->compatible_features),
+ .autoclear_features = cpu_to_be64(s->autoclear_features),
+ .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT),
+ .header_length = cpu_to_be32(header_length),
+ };
+
+ /* For older versions, write a shorter header */
+ switch (s->qcow_version) {
+ case 2:
+ ret = offsetof(QCowHeader, incompatible_features);
+ break;
+ case 3:
+ ret = sizeof(*header);
+ break;
+ default:
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ memset(buf, 0, buflen);
+
+ /* Preserve any unknown field in the header */
+ if (s->unknown_header_fields_size) {
+ if (buflen < s->unknown_header_fields_size) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
+ buf += s->unknown_header_fields_size;
+ buflen -= s->unknown_header_fields_size;
+ }
+
+ /* Backing file format header extension */
+ if (*bs->backing_format) {
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
+ bs->backing_format, strlen(bs->backing_format),
+ buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ }
+
+ /* Feature table */
+ Qcow2Feature features[] = {
+ {
+ .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
+ .bit = QCOW2_INCOMPAT_DIRTY_BITNR,
+ .name = "dirty bit",
+ },
+ {
+ .type = QCOW2_FEAT_TYPE_COMPATIBLE,
+ .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+ .name = "lazy refcounts",
+ },
+ };
+
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
+ features, sizeof(features), buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+ buf += ret;
+ buflen -= ret;
+
+ /* Keep unknown header extensions */
+ QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
+ ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+ }
+
+ /* End of header extensions */
+ ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ buf += ret;
+ buflen -= ret;
+
+ /* Backing file name */
+ if (*bs->backing_file) {
+ size_t backing_file_len = strlen(bs->backing_file);
+
+ if (buflen < backing_file_len) {
+ ret = -ENOSPC;
+ goto fail;
+ }
+
+ /* strncpy is fine here: the backing file name stored in the header need not
+ * be NUL-terminated. */
+ strncpy(buf, bs->backing_file, buflen);
+
+ header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
+ header->backing_file_size = cpu_to_be32(backing_file_len);
+ }
+
+ /* Write the new header */
+ ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = 0;
+fail:
+ qemu_vfree(header);
+ return ret;
+}
+
+static int qcow2_change_backing_file(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt)
+{
+ pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
+
+ return qcow2_update_header(bs);
+}
+
+static int preallocate(BlockDriverState *bs)
+{
+ uint64_t nb_sectors;
+ uint64_t offset;
+ uint64_t host_offset = 0;
+ int num;
+ int ret;
+ QCowL2Meta *meta;
+
+ nb_sectors = bdrv_getlength(bs) >> 9;
+ offset = 0;
+
+ while (nb_sectors) {
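+ /* num is a plain int; cap each pass at INT_MAX bytes' worth of sectors. */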
+ num = MIN(nb_sectors, INT_MAX >> 9);
+ ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
+ &host_offset, &meta);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = qcow2_alloc_cluster_link_l2(bs, meta);
+ if (ret < 0) {
+ qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters,
+ QCOW2_DISCARD_NEVER);
+ return ret;
+ }
+
+ /* There are no dependent requests, but we need to remove our request
+ * from the list of in-flight requests */
+ if (meta != NULL) {
+ QLIST_REMOVE(meta, next_in_flight);
+ }
+
+ /* TODO Preallocate data if requested */
+
+ nb_sectors -= num;
+ offset += num << 9;
+ }
+
+ /*
+ * It is expected that the image file is large enough to actually contain
+ * all of the allocated clusters (otherwise we get failing reads after
+ * EOF). Extend the image to the last allocated sector.
+ */
+ if (host_offset != 0) {
+ uint8_t buf[512];
+ memset(buf, 0, 512);
+ ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int qcow2_create2(const char *filename, int64_t total_size,
+ const char *backing_file, const char *backing_format,
+ int flags, size_t cluster_size, int prealloc,
+ QEMUOptionParameter *options, int version)
+{
+ /* Calculate cluster_bits */
+ int cluster_bits;
+ cluster_bits = ffs(cluster_size) - 1;
+ if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
+ (1 << cluster_bits) != cluster_size)
+ {
+ error_report(
+ "Cluster size must be a power of two between %d and %dk",
+ 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
+ return -EINVAL;
+ }
+
+ /*
+ * Open the image file and write a minimal qcow2 header.
+ *
+ * We keep things simple and start with a zero-sized image. We also
+ * do without refcount blocks or an L1 table for now. We'll fix the
+ * inconsistency later.
+ *
+ * We do need a refcount table because growing the refcount table means
+ * allocating two new refcount blocks - the second of which would be at
+ * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
+ * size for any qcow2 image.
+ */
+ BlockDriverState* bs;
+ QCowHeader header;
+ uint8_t* refcount_table;
+ int ret;
+
+ ret = bdrv_create_file(filename, options);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Write the header */
+ memset(&header, 0, sizeof(header));
+ header.magic = cpu_to_be32(QCOW_MAGIC);
+ header.version = cpu_to_be32(version);
+ header.cluster_bits = cpu_to_be32(cluster_bits);
+ header.size = cpu_to_be64(0);
+ header.l1_table_offset = cpu_to_be64(0);
+ header.l1_size = cpu_to_be32(0);
+ header.refcount_table_offset = cpu_to_be64(cluster_size);
+ header.refcount_table_clusters = cpu_to_be32(1);
+ header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
+ header.header_length = cpu_to_be32(sizeof(header));
+
+ if (flags & BLOCK_FLAG_ENCRYPT) {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+ } else {
+ header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+ }
+
+ if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
+ header.compatible_features |=
+ cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
+ }
+
+ ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* Write an empty refcount table */
+ refcount_table = g_malloc0(cluster_size);
+ ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
+ g_free(refcount_table);
+
+ if (ret < 0) {
+ goto out;
+ }
+
+ bdrv_close(bs);
+
+ /*
+ * And now open the image and make it consistent first (i.e. increase the
+ * refcount of the cluster that is occupied by the header and the refcount
+ * table)
+ */
+ BlockDriver* drv = bdrv_find_format("qcow2");
+ assert(drv != NULL);
+ ret = bdrv_open(bs, filename, NULL,
+ BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
+ if (ret < 0) {
+ goto out;
+
+ } else if (ret != 0) {
+ error_report("Huh, first cluster in empty image is already in use?");
+ abort();
+ }
+
+ /* Okay, now that we have a valid image, let's give it the right size */
+ ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* Want a backing file? There you go. */
+ if (backing_file) {
+ ret = bdrv_change_backing_file(bs, backing_file, backing_format);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ /* And if we're supposed to preallocate metadata, do that now */
+ if (prealloc) {
+ BDRVQcowState *s = bs->opaque;
+ qemu_co_mutex_lock(&s->lock);
+ ret = preallocate(bs);
+ qemu_co_mutex_unlock(&s->lock);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ bdrv_delete(bs);
+ return ret;
+}
+
+static int qcow2_create(const char *filename, QEMUOptionParameter *options)
+{
+ const char *backing_file = NULL;
+ const char *backing_fmt = NULL;
+ uint64_t sectors = 0;
+ int flags = 0;
+ size_t cluster_size = DEFAULT_CLUSTER_SIZE;
+ int prealloc = 0;
+ int version = 2;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
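+ /* The size option is given in bytes; convert it to 512-byte sectors. */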
+ sectors = options->value.n / 512;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+ backing_fmt = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
+ flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ cluster_size = options->value.n;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
+ if (!options->value.s || !strcmp(options->value.s, "off")) {
+ prealloc = 0;
+ } else if (!strcmp(options->value.s, "metadata")) {
+ prealloc = 1;
+ } else {
+ fprintf(stderr, "Invalid preallocation mode: '%s'\n",
+ options->value.s);
+ return -EINVAL;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
+ if (!options->value.s || !strcmp(options->value.s, "0.10")) {
+ version = 2;
+ } else if (!strcmp(options->value.s, "1.1")) {
+ version = 3;
+ } else {
+ fprintf(stderr, "Invalid compatibility level: '%s'\n",
+ options->value.s);
+ return -EINVAL;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
+ flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
+ }
+ options++;
+ }
+
+ if (backing_file && prealloc) {
+ fprintf(stderr, "Backing file and preallocation cannot be used at "
+ "the same time\n");
+ return -EINVAL;
+ }
+
+ if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
+ fprintf(stderr, "Lazy refcounts only supported with compatibility "
+ "level 1.1 and above (use compat=1.1 or greater)\n");
+ return -EINVAL;
+ }
+
+ return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
+ cluster_size, prealloc, options, version);
+}
+
+static int qcow2_make_empty(BlockDriverState *bs)
+{
+#if 0
+ /* XXX: not correct */
+ BDRVQcowState *s = bs->opaque;
+ uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+ int ret;
+
+ memset(s->l1_table, 0, l1_length);
+ if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
+ return -1;
+ ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
+ if (ret < 0)
+ return ret;
+
+ l2_cache_reset(bs);
+#endif
+ return 0;
+}
+
+static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ int ret;
+ BDRVQcowState *s = bs->opaque;
+
+ /* Misaligned zero writes are not handled here; returning -ENOTSUP lets the
+ * generic block layer emulate them with regular writes. */
+ if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
+ return -ENOTSUP;
+ }
+
+ /* Whatever is left can use real zero clusters */
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors)
+{
+ int ret;
+ BDRVQcowState *s = bs->opaque;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors);
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+}
+
+static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t new_l1_size;
+ int ret;
+
+ if (offset & 511) {
+ error_report("The new size must be a multiple of 512");
+ return -EINVAL;
+ }
+
+ /* cannot proceed if image has snapshots */
+ if (s->nb_snapshots) {
+ error_report("Can't resize an image which has snapshots");
+ return -ENOTSUP;
+ }
+
+ /* shrinking is currently not supported */
+ if (offset < bs->total_sectors * 512) {
+ error_report("qcow2 doesn't support shrinking images yet");
+ return -ENOTSUP;
+ }
+
+ new_l1_size = size_to_l1(s, offset);
+ ret = qcow2_grow_l1_table(bs, new_l1_size, true);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* write updated header.size */
+ offset = cpu_to_be64(offset);
+ ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
+ &offset, sizeof(uint64_t));
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->l1_vm_state_index = new_l1_size;
+ return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+ tables to avoid losing bytes in alignment */
+static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ z_stream strm;
+ int ret, out_len;
+ uint8_t *out_buf;
+ uint64_t cluster_offset;
+
+ if (nb_sectors == 0) {
+ /* align end of file to a sector boundary to ease reading with
+ sector-based I/Os */
+ cluster_offset = bdrv_getlength(bs->file);
+ cluster_offset = (cluster_offset + 511) & ~511;
+ bdrv_truncate(bs->file, cluster_offset);
+ return 0;
+ }
+
+ if (nb_sectors != s->cluster_sectors) {
+ ret = -EINVAL;
+
+ /* Zero-pad last write if image size is not cluster aligned */
+ if (sector_num + nb_sectors == bs->total_sectors &&
+ nb_sectors < s->cluster_sectors) {
+ uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
+ memset(pad_buf, 0, s->cluster_size);
+ memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
+ ret = qcow2_write_compressed(bs, sector_num,
+ pad_buf, s->cluster_sectors);
+ qemu_vfree(pad_buf);
+ }
+ return ret;
+ }
+
+ out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+ /* default compression level, small window, raw deflate (no zlib header) */
+ memset(&strm, 0, sizeof(strm));
+ ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -12,
+ 9, Z_DEFAULT_STRATEGY);
+ if (ret != 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ strm.avail_in = s->cluster_size;
+ strm.next_in = (uint8_t *)buf;
+ strm.avail_out = s->cluster_size;
+ strm.next_out = out_buf;
+
+ ret = deflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END && ret != Z_OK) {
+ deflateEnd(&strm);
+ ret = -EINVAL;
+ goto fail;
+ }
+ out_len = strm.next_out - out_buf;
+
+ deflateEnd(&strm);
+
+ if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+ /* could not compress: write normal cluster */
+ ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
+ if (ret < 0) {
+ goto fail;
+ }
+ } else {
+ cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
+ sector_num << 9, out_len);
+ if (!cluster_offset) {
+ ret = -EIO;
+ goto fail;
+ }
+ cluster_offset &= s->cluster_offset_mask;
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
+ ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ g_free(out_buf);
+ return ret;
+}
+
+static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret;
+
+ qemu_co_mutex_lock(&s->lock);
+ ret = qcow2_cache_flush(bs, s->l2_table_cache);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
+
+ if (qcow2_need_accurate_refcounts(s)) {
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
+ }
+ qemu_co_mutex_unlock(&s->lock);
+
+ return 0;
+}
+
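+/* The VM state (savevm data) is stored beyond the guest-visible disk,
+ * starting at the guest offset mapped by L1 entry l1_vm_state_index. */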
+static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
+{
+ return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
+}
+
+static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQcowState *s = bs->opaque;
+ bdi->cluster_size = s->cluster_size;
+ bdi->vm_state_offset = qcow2_vm_state_offset(s);
+ return 0;
+}
+
+#if 0
+static void dump_refcounts(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t nb_clusters, k, k1, size;
+ int refcount;
+
+ size = bdrv_getlength(bs->file);
+ nb_clusters = size_to_clusters(s, size);
+ for(k = 0; k < nb_clusters;) {
+ k1 = k;
+ refcount = get_refcount(bs, k);
+ k++;
+ while (k < nb_clusters && get_refcount(bs, k) == refcount)
+ k++;
+ printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
+ k - k1);
+ }
+}
+#endif
+
+static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t pos)
+{
+ BDRVQcowState *s = bs->opaque;
+ int growable = bs->growable;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
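+ /* The write lands beyond the guest-visible disk size, so temporarily mark
+ * the BDS growable to keep the request from being rejected. */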
+ bs->growable = 1;
+ ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
+ bs->growable = growable;
+
+ return ret;
+}
+
+static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int growable = bs->growable;
+ int ret;
+
+ BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
+ bs->growable = 1;
+ ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
+ bs->growable = growable;
+
+ return ret;
+}
+
+static QEMUOptionParameter qcow2_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ {
+ .name = BLOCK_OPT_COMPAT_LEVEL,
+ .type = OPT_STRING,
+ .help = "Compatibility level (0.10 or 1.1)"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ },
+ {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the base image"
+ },
+ {
+ .name = BLOCK_OPT_ENCRYPT,
+ .type = OPT_FLAG,
+ .help = "Encrypt the image"
+ },
+ {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "qcow2 cluster size",
+ .value = { .n = DEFAULT_CLUSTER_SIZE },
+ },
+ {
+ .name = BLOCK_OPT_PREALLOC,
+ .type = OPT_STRING,
+ .help = "Preallocation mode (allowed values: off, metadata)"
+ },
+ {
+ .name = BLOCK_OPT_LAZY_REFCOUNTS,
+ .type = OPT_FLAG,
+ .help = "Postpone refcount updates",
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_qcow2 = {
+ .format_name = "qcow2",
+ .instance_size = sizeof(BDRVQcowState),
+ .bdrv_probe = qcow2_probe,
+ .bdrv_open = qcow2_open,
+ .bdrv_close = qcow2_close,
+ .bdrv_reopen_prepare = qcow2_reopen_prepare,
+ .bdrv_create = qcow2_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+ .bdrv_co_is_allocated = qcow2_co_is_allocated,
+ .bdrv_set_key = qcow2_set_key,
+ .bdrv_make_empty = qcow2_make_empty,
+
+ .bdrv_co_readv = qcow2_co_readv,
+ .bdrv_co_writev = qcow2_co_writev,
+ .bdrv_co_flush_to_os = qcow2_co_flush_to_os,
+
+ .bdrv_co_write_zeroes = qcow2_co_write_zeroes,
+ .bdrv_co_discard = qcow2_co_discard,
+ .bdrv_truncate = qcow2_truncate,
+ .bdrv_write_compressed = qcow2_write_compressed,
+
+ .bdrv_snapshot_create = qcow2_snapshot_create,
+ .bdrv_snapshot_goto = qcow2_snapshot_goto,
+ .bdrv_snapshot_delete = qcow2_snapshot_delete,
+ .bdrv_snapshot_list = qcow2_snapshot_list,
+ .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
+ .bdrv_get_info = qcow2_get_info,
+
+ .bdrv_save_vmstate = qcow2_save_vmstate,
+ .bdrv_load_vmstate = qcow2_load_vmstate,
+
+ .bdrv_change_backing_file = qcow2_change_backing_file,
+
+ .bdrv_invalidate_cache = qcow2_invalidate_cache,
+
+ .create_options = qcow2_create_options,
+ .bdrv_check = qcow2_check,
+};
+
+static void bdrv_qcow2_init(void)
+{
+ bdrv_register(&bdrv_qcow2);
+}
+
+block_init(bdrv_qcow2_init);
diff --git a/contrib/qemu/block/qcow2.h b/contrib/qemu/block/qcow2.h
new file mode 100644
index 000000000..3b2d5cda7
--- /dev/null
+++ b/contrib/qemu/block/qcow2.h
@@ -0,0 +1,437 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_QCOW2_H
+#define BLOCK_QCOW2_H
+
+#include "qemu/aes.h"
+#include "block/coroutine.h"
+
+//#define DEBUG_ALLOC
+//#define DEBUG_ALLOC2
+//#define DEBUG_EXT
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1
+
+#define QCOW_MAX_CRYPT_CLUSTERS 32
+
+/* indicate that the refcount of the referenced cluster is exactly one. */
+#define QCOW_OFLAG_COPIED (1LL << 63)
+/* indicate that the cluster is compressed (they never have the copied flag) */
+#define QCOW_OFLAG_COMPRESSED (1LL << 62)
+/* The cluster reads as all zeros */
+#define QCOW_OFLAG_ZERO (1LL << 0)
+
+#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
+
+#define MIN_CLUSTER_BITS 9
+#define MAX_CLUSTER_BITS 21
+
+#define L2_CACHE_SIZE 16
+
+/* Must be at least 4 to cover all cases of refcount table growth */
+#define REFCOUNT_CACHE_SIZE 4
+
+#define DEFAULT_CLUSTER_SIZE 65536
+
+
+#define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts"
+#define QCOW2_OPT_DISCARD_REQUEST "pass_discard_request"
+#define QCOW2_OPT_DISCARD_SNAPSHOT "pass_discard_snapshot"
+#define QCOW2_OPT_DISCARD_OTHER "pass_discard_other"
+
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t cluster_bits;
+ uint64_t size; /* in bytes */
+ uint32_t crypt_method;
+ uint32_t l1_size; /* XXX: save number of clusters instead ? */
+ uint64_t l1_table_offset;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_clusters;
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+
+ /* The following fields are only valid for version >= 3 */
+ uint64_t incompatible_features;
+ uint64_t compatible_features;
+ uint64_t autoclear_features;
+
+ uint32_t refcount_order;
+ uint32_t header_length;
+} QCowHeader;
+
+typedef struct QCowSnapshot {
+ uint64_t l1_table_offset;
+ uint32_t l1_size;
+ char *id_str;
+ char *name;
+ uint64_t disk_size;
+ uint64_t vm_state_size;
+ uint32_t date_sec;
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec;
+} QCowSnapshot;
+
+struct Qcow2Cache;
+typedef struct Qcow2Cache Qcow2Cache;
+
+typedef struct Qcow2UnknownHeaderExtension {
+ uint32_t magic;
+ uint32_t len;
+ QLIST_ENTRY(Qcow2UnknownHeaderExtension) next;
+ uint8_t data[];
+} Qcow2UnknownHeaderExtension;
+
+enum {
+ QCOW2_FEAT_TYPE_INCOMPATIBLE = 0,
+ QCOW2_FEAT_TYPE_COMPATIBLE = 1,
+ QCOW2_FEAT_TYPE_AUTOCLEAR = 2,
+};
+
+/* Incompatible feature bits */
+enum {
+ QCOW2_INCOMPAT_DIRTY_BITNR = 0,
+ QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+
+ QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY,
+};
+
+/* Compatible feature bits */
+enum {
+ QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0,
+ QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+
+ QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS,
+};
+
+enum qcow2_discard_type {
+ QCOW2_DISCARD_NEVER = 0,
+ QCOW2_DISCARD_ALWAYS,
+ QCOW2_DISCARD_REQUEST,
+ QCOW2_DISCARD_SNAPSHOT,
+ QCOW2_DISCARD_OTHER,
+ QCOW2_DISCARD_MAX
+};
+
+typedef struct Qcow2Feature {
+ uint8_t type;
+ uint8_t bit;
+ char name[46];
+} QEMU_PACKED Qcow2Feature;
+
+typedef struct Qcow2DiscardRegion {
+ BlockDriverState *bs;
+ uint64_t offset;
+ uint64_t bytes;
+ QTAILQ_ENTRY(Qcow2DiscardRegion) next;
+} Qcow2DiscardRegion;
+
+typedef struct BDRVQcowState {
+ int cluster_bits;
+ int cluster_size;
+ int cluster_sectors;
+ int l2_bits;
+ int l2_size;
+ int l1_size;
+ int l1_vm_state_index;
+ int csize_shift;
+ int csize_mask;
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset;
+ uint64_t *l1_table;
+
+ Qcow2Cache* l2_table_cache;
+ Qcow2Cache* refcount_block_cache;
+
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset;
+ QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
+
+ uint64_t *refcount_table;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_size;
+ int64_t free_cluster_index;
+ int64_t free_byte_offset;
+
+ CoMutex lock;
+
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ uint64_t snapshots_offset;
+ int snapshots_size;
+ int nb_snapshots;
+ QCowSnapshot *snapshots;
+
+ int flags;
+ int qcow_version;
+ bool use_lazy_refcounts;
+
+ bool discard_passthrough[QCOW2_DISCARD_MAX];
+
+ uint64_t incompatible_features;
+ uint64_t compatible_features;
+ uint64_t autoclear_features;
+
+ size_t unknown_header_fields_size;
+ void* unknown_header_fields;
+ QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
+ QTAILQ_HEAD (, Qcow2DiscardRegion) discards;
+ bool cache_discards;
+} BDRVQcowState;
+
+/* XXX: use std qcow open function ? */
+typedef struct QCowCreateState {
+ int cluster_size;
+ int cluster_bits;
+ uint16_t *refcount_block;
+ uint64_t *refcount_table;
+ int64_t l1_table_offset;
+ int64_t refcount_table_offset;
+ int64_t refcount_block_offset;
+} QCowCreateState;
+
+struct QCowAIOCB;
+
+typedef struct Qcow2COWRegion {
+ /**
+ * Offset of the COW region in bytes from the start of the first cluster
+ * touched by the request.
+ */
+ uint64_t offset;
+
+ /** Number of sectors to copy */
+ int nb_sectors;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
+typedef struct QCowL2Meta
+{
+ /** Guest offset of the first newly allocated cluster */
+ uint64_t offset;
+
+ /** Host offset of the first newly allocated cluster */
+ uint64_t alloc_offset;
+
+ /**
+ * Number of sectors from the start of the first allocated cluster to
+ * the end of the (possibly shortened) request
+ */
+ int nb_available;
+
+ /** Number of newly allocated clusters */
+ int nb_clusters;
+
+ /**
+ * Requests that overlap with this allocation and wait to be restarted
+ * when the allocating request has completed.
+ */
+ CoQueue dependent_requests;
+
+ /**
+ * The COW Region between the start of the first allocated cluster and the
+ * area the guest actually writes to.
+ */
+ Qcow2COWRegion cow_start;
+
+ /**
+ * The COW Region between the area the guest actually writes to and the
+ * end of the last allocated cluster.
+ */
+ Qcow2COWRegion cow_end;
+
+ /** Pointer to next L2Meta of the same write request */
+ struct QCowL2Meta *next;
+
+ QLIST_ENTRY(QCowL2Meta) next_in_flight;
+} QCowL2Meta;
+
+enum {
+ QCOW2_CLUSTER_UNALLOCATED,
+ QCOW2_CLUSTER_NORMAL,
+ QCOW2_CLUSTER_COMPRESSED,
+ QCOW2_CLUSTER_ZERO
+};
+
+#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
+
+#define REFT_OFFSET_MASK 0xffffffffffffff00ULL
+
+static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
+{
+ return offset & ~(s->cluster_size - 1);
+}
+
+static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset)
+{
+ return offset & (s->cluster_size - 1);
+}
+
+static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
+{
+ return (size + (s->cluster_size - 1)) >> s->cluster_bits;
+}
+
+static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size)
+{
+ int shift = s->cluster_bits + s->l2_bits;
+ return (size + (1ULL << shift) - 1) >> shift;
+}
+
+static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset)
+{
+ return (offset >> s->cluster_bits) & (s->l2_size - 1);
+}
+
+static inline int64_t align_offset(int64_t offset, int n)
+{
+ offset = (offset + n - 1) & ~(n - 1);
+ return offset;
+}
+
+static inline int qcow2_get_cluster_type(uint64_t l2_entry)
+{
+ if (l2_entry & QCOW_OFLAG_COMPRESSED) {
+ return QCOW2_CLUSTER_COMPRESSED;
+ } else if (l2_entry & QCOW_OFLAG_ZERO) {
+ return QCOW2_CLUSTER_ZERO;
+ } else if (!(l2_entry & L2E_OFFSET_MASK)) {
+ return QCOW2_CLUSTER_UNALLOCATED;
+ } else {
+ return QCOW2_CLUSTER_NORMAL;
+ }
+}
+
+/* Check whether refcounts are eager or lazy */
+static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
+{
+ return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY);
+}
+
+static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
+{
+ return m->offset + m->cow_start.offset;
+}
+
+static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
+{
+ return m->offset + m->cow_end.offset
+ + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+}
+
+// FIXME Need qcow2_ prefix to global functions
+
+/* qcow2.c functions */
+int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t sector_num, int nb_sectors);
+
+int qcow2_mark_dirty(BlockDriverState *bs);
+int qcow2_update_header(BlockDriverState *bs);
+
+/* qcow2-refcount.c functions */
+int qcow2_refcount_init(BlockDriverState *bs);
+void qcow2_refcount_close(BlockDriverState *bs);
+
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
+int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int nb_clusters);
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
+void qcow2_free_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t size,
+ enum qcow2_discard_type type);
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+ int nb_clusters, enum qcow2_discard_type type);
+
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+ int64_t l1_table_offset, int l1_size, int addend);
+
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix);
+
+void qcow2_process_discards(BlockDriverState *bs, int ret);
+
+/* qcow2-cluster.c functions */
+int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
+ bool exact_size);
+void qcow2_l2_cache_reset(BlockDriverState *bs);
+int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, int enc,
+ const AES_KEY *key);
+
+int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int *num, uint64_t *cluster_offset);
+int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
+ int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m);
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int compressed_size);
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
+int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
+ int nb_sectors);
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
+
+/* qcow2-snapshot.c functions */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab);
+int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name);
+
+void qcow2_free_snapshots(BlockDriverState *bs);
+int qcow2_read_snapshots(BlockDriverState *bs);
+
+/* qcow2-cache.c functions */
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
+int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
+
+void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
+int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
+ Qcow2Cache *dependency);
+void qcow2_cache_depends_on_flush(Qcow2Cache *c);
+
+int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table);
+int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
+ void **table);
+int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
+
+#endif
diff --git a/contrib/qemu/block/qed-check.c b/contrib/qemu/block/qed-check.c
new file mode 100644
index 000000000..b473dcd61
--- /dev/null
+++ b/contrib/qemu/block/qed-check.c
@@ -0,0 +1,248 @@
+/*
+ * QEMU Enhanced Disk Format Consistency Check
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+typedef struct {
+ BDRVQEDState *s;
+ BdrvCheckResult *result;
+ bool fix; /* whether to fix invalid offsets */
+
+ uint64_t nclusters;
+ uint32_t *used_clusters; /* referenced cluster bitmap */
+
+ QEDRequest request;
+} QEDCheck;
+
+static bool qed_test_bit(uint32_t *bitmap, uint64_t n) {
+ return !!(bitmap[n / 32] & (1 << (n % 32)));
+}
+
+static void qed_set_bit(uint32_t *bitmap, uint64_t n) {
+ bitmap[n / 32] |= 1 << (n % 32);
+}
+
+/**
+ * Set bitmap bits for clusters
+ *
+ * @check: Check structure
+ * @offset: Starting offset in bytes
+ * @n: Number of clusters
+ */
+static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset,
+ unsigned int n)
+{
+ uint64_t cluster = qed_bytes_to_clusters(check->s, offset);
+ unsigned int corruptions = 0;
+
+ while (n-- != 0) {
+ /* Clusters should only be referenced once */
+ if (qed_test_bit(check->used_clusters, cluster)) {
+ corruptions++;
+ }
+
+ qed_set_bit(check->used_clusters, cluster);
+ cluster++;
+ }
+
+ check->result->corruptions += corruptions;
+ return corruptions == 0;
+}
+
+/**
+ * Check an L2 table
+ *
+ * @ret: Number of invalid cluster offsets
+ */
+static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid = 0;
+ uint64_t last_offset = 0;
+
+ for (i = 0; i < s->table_nelems; i++) {
+ uint64_t offset = table->offsets[i];
+
+ if (qed_offset_is_unalloc_cluster(offset) ||
+ qed_offset_is_zero_cluster(offset)) {
+ continue;
+ }
+ check->result->bfi.allocated_clusters++;
+ if (last_offset && (last_offset + s->header.cluster_size != offset)) {
+ check->result->bfi.fragmented_clusters++;
+ }
+ last_offset = offset;
+
+ /* Detect invalid cluster offset */
+ if (!qed_check_cluster_offset(s, offset)) {
+ if (check->fix) {
+ table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid++;
+ continue;
+ }
+
+ qed_set_used_clusters(check, offset, 1);
+ }
+
+ return num_invalid;
+}
+
+/**
+ * Descend tables and check each cluster is referenced once only
+ */
+static int qed_check_l1_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid_l1 = 0;
+ int ret, last_error = 0;
+
+ /* Mark L1 table clusters used */
+ qed_set_used_clusters(check, s->header.l1_table_offset,
+ s->header.table_size);
+
+ for (i = 0; i < s->table_nelems; i++) {
+ unsigned int num_invalid_l2;
+ uint64_t offset = table->offsets[i];
+
+ if (qed_offset_is_unalloc_cluster(offset)) {
+ continue;
+ }
+
+ /* Detect invalid L2 offset */
+ if (!qed_check_table_offset(s, offset)) {
+ /* Clear invalid offset */
+ if (check->fix) {
+ table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid_l1++;
+ continue;
+ }
+
+ if (!qed_set_used_clusters(check, offset, s->header.table_size)) {
+ continue; /* skip an invalid table */
+ }
+
+ ret = qed_read_l2_table_sync(s, &check->request, offset);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+
+ num_invalid_l2 = qed_check_l2_table(check,
+ check->request.l2_table->table);
+
+ /* Write out fixed L2 table */
+ if (num_invalid_l2 > 0 && check->fix) {
+ ret = qed_write_l2_table_sync(s, &check->request, 0,
+ s->table_nelems, false);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+ }
+ }
+
+ /* Drop reference to final table */
+ qed_unref_l2_cache_entry(check->request.l2_table);
+ check->request.l2_table = NULL;
+
+ /* Write out fixed L1 table */
+ if (num_invalid_l1 > 0 && check->fix) {
+ ret = qed_write_l1_table_sync(s, 0, s->table_nelems);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ }
+ }
+
+ return last_error;
+}
+
+/**
+ * Check for unreferenced (leaked) clusters
+ */
+static void qed_check_for_leaks(QEDCheck *check)
+{
+ BDRVQEDState *s = check->s;
+ uint64_t i;
+
+ for (i = s->header.header_size; i < check->nclusters; i++) {
+ if (!qed_test_bit(check->used_clusters, i)) {
+ check->result->leaks++;
+ }
+ }
+}
+
+/**
+ * Mark an image clean once it passes check or has been repaired
+ */
+static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result)
+{
+ /* Skip if there were unfixable corruptions or I/O errors */
+ if (result->corruptions > 0 || result->check_errors > 0) {
+ return;
+ }
+
+ /* Skip if image is already marked clean */
+ if (!(s->header.features & QED_F_NEED_CHECK)) {
+ return;
+ }
+
+ /* Ensure fixes reach storage before clearing check bit */
+ bdrv_flush(s->bs);
+
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+}
+
+int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
+{
+ QEDCheck check = {
+ .s = s,
+ .result = result,
+ .nclusters = qed_bytes_to_clusters(s, s->file_size),
+ .request = { .l2_table = NULL },
+ .fix = fix,
+ };
+ int ret;
+
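+ /* One bit per cluster, rounded up to whole 32-bit words. */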
+ check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
+ sizeof(check.used_clusters[0]));
+
+ check.result->bfi.total_clusters =
+ (s->header.image_size + s->header.cluster_size - 1) /
+ s->header.cluster_size;
+ ret = qed_check_l1_table(&check, s->l1_table);
+ if (ret == 0) {
+ /* Only check for leaks if entire image was scanned successfully */
+ qed_check_for_leaks(&check);
+
+ if (fix) {
+ qed_check_mark_clean(s, result);
+ }
+ }
+
+ g_free(check.used_clusters);
+ return ret;
+}
diff --git a/contrib/qemu/block/qed-cluster.c b/contrib/qemu/block/qed-cluster.c
new file mode 100644
index 000000000..f64b2af8f
--- /dev/null
+++ b/contrib/qemu/block/qed-cluster.c
@@ -0,0 +1,165 @@
+/*
+ * QEMU Enhanced Disk Format Cluster functions
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+/**
+ * Count the number of contiguous data clusters
+ *
+ * @s: QED state
+ * @table: L2 table
+ * @index: First cluster index
+ * @n: Maximum number of clusters
+ * @offset: Set to first cluster offset
+ *
+ * This function scans tables for contiguous clusters. A contiguous run of
+ * clusters may be allocated, unallocated, or zero.
+ */
+static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
+ QEDTable *table,
+ unsigned int index,
+ unsigned int n,
+ uint64_t *offset)
+{
+ unsigned int end = MIN(index + n, s->table_nelems);
+ uint64_t last = table->offsets[index];
+ unsigned int i;
+
+ *offset = last;
+
+ for (i = index + 1; i < end; i++) {
+ if (qed_offset_is_unalloc_cluster(last)) {
+ /* Counting unallocated clusters */
+ if (!qed_offset_is_unalloc_cluster(table->offsets[i])) {
+ break;
+ }
+ } else if (qed_offset_is_zero_cluster(last)) {
+ /* Counting zero clusters */
+ if (!qed_offset_is_zero_cluster(table->offsets[i])) {
+ break;
+ }
+ } else {
+ /* Counting allocated clusters */
+ if (table->offsets[i] != last + s->header.cluster_size) {
+ break;
+ }
+ last = table->offsets[i];
+ }
+ }
+ return i - index;
+}
+
+typedef struct {
+ BDRVQEDState *s;
+ uint64_t pos;
+ size_t len;
+
+ QEDRequest *request;
+
+ /* User callback */
+ QEDFindClusterFunc *cb;
+ void *opaque;
+} QEDFindClusterCB;
+
+static void qed_find_cluster_cb(void *opaque, int ret)
+{
+ QEDFindClusterCB *find_cluster_cb = opaque;
+ BDRVQEDState *s = find_cluster_cb->s;
+ QEDRequest *request = find_cluster_cb->request;
+ uint64_t offset = 0;
+ size_t len = 0;
+ unsigned int index;
+ unsigned int n;
+
+ if (ret) {
+ goto out;
+ }
+
+ index = qed_l2_index(s, find_cluster_cb->pos);
+ n = qed_bytes_to_clusters(s,
+ qed_offset_into_cluster(s, find_cluster_cb->pos) +
+ find_cluster_cb->len);
+ n = qed_count_contiguous_clusters(s, request->l2_table->table,
+ index, n, &offset);
+
+ if (qed_offset_is_unalloc_cluster(offset)) {
+ ret = QED_CLUSTER_L2;
+ } else if (qed_offset_is_zero_cluster(offset)) {
+ ret = QED_CLUSTER_ZERO;
+ } else if (qed_check_cluster_offset(s, offset)) {
+ ret = QED_CLUSTER_FOUND;
+ } else {
+ ret = -EINVAL;
+ }
+
+ len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
+ qed_offset_into_cluster(s, find_cluster_cb->pos));
+
+out:
+ find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+ g_free(find_cluster_cb);
+}
+
+/**
+ * Find the offset of a data cluster
+ *
+ * @s: QED state
+ * @request: L2 cache entry
+ * @pos: Byte position in device
+ * @len: Number of bytes
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ *
+ * This function translates a position in the block device to an offset in the
+ * image file. It invokes the cb completion callback to report back the
+ * translated offset or unallocated range in the image file.
+ *
+ * If the L2 table exists, request->l2_table points to the L2 table cache entry
+ * and the caller must free the reference when they are finished. The cache
+ * entry is exposed in this way to avoid callers having to read the L2 table
+ * again later during request processing. If request->l2_table is non-NULL it
+ * will be unreferenced before taking on the new cache entry.
+ */
+void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t len, QEDFindClusterFunc *cb, void *opaque)
+{
+ QEDFindClusterCB *find_cluster_cb;
+ uint64_t l2_offset;
+
+ /* Limit length to L2 boundary. Requests are broken up at the L2 boundary
+ * so that a request acts on one L2 table at a time.
+ */
+ len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
+
+ l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
+ if (qed_offset_is_unalloc_cluster(l2_offset)) {
+ cb(opaque, QED_CLUSTER_L1, 0, len);
+ return;
+ }
+ if (!qed_check_table_offset(s, l2_offset)) {
+ cb(opaque, -EINVAL, 0, 0);
+ return;
+ }
+
+ find_cluster_cb = g_malloc(sizeof(*find_cluster_cb));
+ find_cluster_cb->s = s;
+ find_cluster_cb->pos = pos;
+ find_cluster_cb->len = len;
+ find_cluster_cb->cb = cb;
+ find_cluster_cb->opaque = opaque;
+ find_cluster_cb->request = request;
+
+ qed_read_l2_table(s, request, l2_offset,
+ qed_find_cluster_cb, find_cluster_cb);
+}
diff --git a/contrib/qemu/block/qed-gencb.c b/contrib/qemu/block/qed-gencb.c
new file mode 100644
index 000000000..7d7ac1ffc
--- /dev/null
+++ b/contrib/qemu/block/qed-gencb.c
@@ -0,0 +1,32 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque)
+{
+ GenericCB *gencb = g_malloc(len);
+ gencb->cb = cb;
+ gencb->opaque = opaque;
+ return gencb;
+}
+
+void gencb_complete(void *opaque, int ret)
+{
+ GenericCB *gencb = opaque;
+ BlockDriverCompletionFunc *cb = gencb->cb;
+ void *user_opaque = gencb->opaque;
+
+ g_free(gencb);
+ cb(user_opaque, ret);
+}
diff --git a/contrib/qemu/block/qed-l2-cache.c b/contrib/qemu/block/qed-l2-cache.c
new file mode 100644
index 000000000..e9b2aae44
--- /dev/null
+++ b/contrib/qemu/block/qed-l2-cache.c
@@ -0,0 +1,187 @@
+/*
+ * QEMU Enhanced Disk Format L2 Cache
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * L2 table cache usage is as follows:
+ *
+ * An open image has one L2 table cache that is used to avoid accessing the
+ * image file for recently referenced L2 tables.
+ *
+ * Cluster offset lookup translates the logical offset within the block device
+ * to a cluster offset within the image file. This is done by indexing into
+ * the L1 and L2 tables which store cluster offsets. It is here where the L2
+ * table cache serves up recently referenced L2 tables.
+ *
+ * If there is a cache miss, that L2 table is read from the image file and
+ * committed to the cache. Subsequent accesses to that L2 table will be served
+ * from the cache until the table is evicted from the cache.
+ *
+ * L2 tables are also committed to the cache when new L2 tables are allocated
+ * in the image file. Since the L2 table cache is write-through, the new L2
+ * table is first written out to the image file and then committed to the
+ * cache.
+ *
+ * Multiple I/O requests may be using an L2 table cache entry at any given
+ * time. That means an entry may be in use across several requests and
+ * reference counting is needed to free the entry at the correct time. In
+ * particular, an entry evicted from the cache will only be freed once all
+ * references are dropped.
+ *
+ * An in-flight I/O request will hold a reference to a L2 table cache entry for
+ * the period during which it needs to access the L2 table. This includes
+ * cluster offset lookup, L2 table allocation, and L2 table update when a new
+ * data cluster has been allocated.
+ *
+ * An interesting case occurs when two requests need to access an L2 table that
+ * is not in the cache. Since the operation to read the table from the image
+ * file takes some time to complete, both requests may see a cache miss and
+ * start reading the L2 table from the image file. The first to finish will
+ * commit its L2 table into the cache. When the second tries to commit,
+ * its table will be deleted in favor of the existing cache entry.
+ */
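+
+/* Illustrative sketch (not part of the original driver): the typical lifecycle
+ * of a cache entry, mirroring what qed_read_l2_table() in qed-table.c does on
+ * a cache miss. Error handling is omitted for brevity.
+ *
+ * CachedL2Table *t = qed_find_l2_cache_entry(&s->l2_cache, offset);
+ * if (!t) {
+ * t = qed_alloc_l2_cache_entry(&s->l2_cache);
+ * t->table = qed_alloc_table(s);
+ * // ... read the table from the image file, then set t->offset ...
+ * qed_commit_l2_cache_entry(&s->l2_cache, t); // cache steals the reference
+ * t = qed_find_l2_cache_entry(&s->l2_cache, offset); // re-acquire one
+ * }
+ * // ... use t->table ...
+ * qed_unref_l2_cache_entry(t);
+ */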
+
+#include "trace.h"
+#include "qed.h"
+
+/* Each L2 holds 2GB so this lets us fully cache a 100GB disk */
+#define MAX_L2_CACHE_SIZE 50
+
+/**
+ * Initialize the L2 cache
+ */
+void qed_init_l2_cache(L2TableCache *l2_cache)
+{
+ QTAILQ_INIT(&l2_cache->entries);
+ l2_cache->n_entries = 0;
+}
+
+/**
+ * Free the L2 cache
+ */
+void qed_free_l2_cache(L2TableCache *l2_cache)
+{
+ CachedL2Table *entry, *next_entry;
+
+ QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) {
+ qemu_vfree(entry->table);
+ g_free(entry);
+ }
+}
+
+/**
+ * Allocate an uninitialized entry from the cache
+ *
+ * The returned entry has a reference count of 1 and is owned by the caller.
+ * The caller must allocate the actual table field for this entry and it must
+ * be freeable using qemu_vfree().
+ */
+CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache)
+{
+ CachedL2Table *entry;
+
+ entry = g_malloc0(sizeof(*entry));
+ entry->ref++;
+
+ trace_qed_alloc_l2_cache_entry(l2_cache, entry);
+
+ return entry;
+}
+
+/**
+ * Decrease an entry's reference count and free it once the reference count
+ * drops to zero.
+ */
+void qed_unref_l2_cache_entry(CachedL2Table *entry)
+{
+ if (!entry) {
+ return;
+ }
+
+ entry->ref--;
+ trace_qed_unref_l2_cache_entry(entry, entry->ref);
+ if (entry->ref == 0) {
+ qemu_vfree(entry->table);
+ g_free(entry);
+ }
+}
+
+/**
+ * Find an entry in the L2 cache. This may return NULL and it's up to the
+ * caller to satisfy the cache miss.
+ *
+ * For a cached entry, this function increases the reference count and returns
+ * the entry.
+ */
+CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset)
+{
+ CachedL2Table *entry;
+
+ QTAILQ_FOREACH(entry, &l2_cache->entries, node) {
+ if (entry->offset == offset) {
+ trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref);
+ entry->ref++;
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Commit an L2 cache entry into the cache. This is meant to be used as part of
+ * the process to satisfy a cache miss. A caller allocates an entry that is
+ * not yet in the L2 cache; once the entry is valid and present on disk, it
+ * can be committed into the cache.
+ *
+ * Since the cache is write-through, it's important that this function is not
+ * called until the entry is present on disk and the L1 has been updated to
+ * point to the entry.
+ *
+ * N.B. This function steals a reference to the l2_table from the caller so the
+ * caller must obtain a new reference by issuing a call to
+ * qed_find_l2_cache_entry().
+ */
+void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table)
+{
+ CachedL2Table *entry;
+
+ entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset);
+ if (entry) {
+ qed_unref_l2_cache_entry(entry);
+ qed_unref_l2_cache_entry(l2_table);
+ return;
+ }
+
+ /* Evict an unused cache entry so we have space. If all entries are in use
+ * we can grow the cache temporarily and we try to shrink back down later.
+ */
+ if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) {
+ CachedL2Table *next;
+ QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) {
+ if (entry->ref > 1) {
+ continue;
+ }
+
+ QTAILQ_REMOVE(&l2_cache->entries, entry, node);
+ l2_cache->n_entries--;
+ qed_unref_l2_cache_entry(entry);
+
+ /* Stop evicting when we've shrunk back to max size */
+ if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) {
+ break;
+ }
+ }
+ }
+
+ l2_cache->n_entries++;
+ QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node);
+}
diff --git a/contrib/qemu/block/qed-table.c b/contrib/qemu/block/qed-table.c
new file mode 100644
index 000000000..76d2dcccf
--- /dev/null
+++ b/contrib/qemu/block/qed-table.c
@@ -0,0 +1,296 @@
+/*
+ * QEMU Enhanced Disk Format Table I/O
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "qed.h"
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEDTable *table;
+
+ struct iovec iov;
+ QEMUIOVector qiov;
+} QEDReadTableCB;
+
+static void qed_read_table_cb(void *opaque, int ret)
+{
+ QEDReadTableCB *read_table_cb = opaque;
+ QEDTable *table = read_table_cb->table;
+ int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
+ int i;
+
+ /* Handle I/O error */
+ if (ret) {
+ goto out;
+ }
+
+ /* Byteswap offsets */
+ for (i = 0; i < noffsets; i++) {
+ table->offsets[i] = le64_to_cpu(table->offsets[i]);
+ }
+
+out:
+ /* Completion */
+ trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
+ gencb_complete(&read_table_cb->gencb, ret);
+}
+
+static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb),
+ cb, opaque);
+ QEMUIOVector *qiov = &read_table_cb->qiov;
+
+ trace_qed_read_table(s, offset, table);
+
+ read_table_cb->s = s;
+ read_table_cb->table = table;
+ read_table_cb->iov.iov_base = table->offsets,
+ read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size,
+
+ qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
+ bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
+ qiov->size / BDRV_SECTOR_SIZE,
+ qed_read_table_cb, read_table_cb);
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEDTable *orig_table;
+ QEDTable *table;
+ bool flush; /* flush after write? */
+
+ struct iovec iov;
+ QEMUIOVector qiov;
+} QEDWriteTableCB;
+
+static void qed_write_table_cb(void *opaque, int ret)
+{
+ QEDWriteTableCB *write_table_cb = opaque;
+
+ trace_qed_write_table_cb(write_table_cb->s,
+ write_table_cb->orig_table,
+ write_table_cb->flush,
+ ret);
+
+ if (ret) {
+ goto out;
+ }
+
+ if (write_table_cb->flush) {
+ /* We still need to flush first */
+ write_table_cb->flush = false;
+ bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
+ write_table_cb);
+ return;
+ }
+
+out:
+ qemu_vfree(write_table_cb->table);
+ gencb_complete(&write_table_cb->gencb, ret);
+}
+
+/**
+ * Write out an updated part or all of a table
+ *
+ * @s: QED state
+ * @offset: Offset of table in image file, in bytes
+ * @table: Table
+ * @index: Index of first element
+ * @n: Number of elements
+ * @flush: Whether or not to sync to disk
+ * @cb: Completion function
+ * @opaque: Argument for completion function
+ */
+static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDWriteTableCB *write_table_cb;
+ unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
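+ /* A 512-byte sector holds 64 8-byte offsets; writes must cover whole
+ * sectors, so the touched range is widened to sector boundaries below. */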
+ unsigned int start, end, i;
+ size_t len_bytes;
+
+ trace_qed_write_table(s, offset, table, index, n);
+
+ /* Calculate indices of the first and one after last elements */
+ start = index & ~sector_mask;
+ end = (index + n + sector_mask) & ~sector_mask;
+
+ len_bytes = (end - start) * sizeof(uint64_t);
+
+ write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque);
+ write_table_cb->s = s;
+ write_table_cb->orig_table = table;
+ write_table_cb->flush = flush;
+ write_table_cb->table = qemu_blockalign(s->bs, len_bytes);
+ write_table_cb->iov.iov_base = write_table_cb->table->offsets;
+ write_table_cb->iov.iov_len = len_bytes;
+ qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1);
+
+ /* Byteswap table */
+ for (i = start; i < end; i++) {
+ uint64_t le_offset = cpu_to_le64(table->offsets[i]);
+ write_table_cb->table->offsets[i - start] = le_offset;
+ }
+
+ /* Adjust for offset into table */
+ offset += start * sizeof(uint64_t);
+
+ bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+ &write_table_cb->qiov,
+ write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
+ qed_write_table_cb, write_table_cb);
+}
+
+/**
+ * Propagate return value from async callback
+ */
+static void qed_sync_cb(void *opaque, int ret)
+{
+ *(int *)opaque = ret;
+}
+
+int qed_read_l1_table_sync(BDRVQEDState *s)
+{
+ int ret = -EINPROGRESS;
+
+ qed_read_table(s, s->header.l1_table_offset,
+ s->l1_table, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
+ qed_write_table(s, s->header.l1_table_offset,
+ s->l1_table, index, n, false, cb, opaque);
+}
+
+int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
+ unsigned int n)
+{
+ int ret = -EINPROGRESS;
+
+ qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ uint64_t l2_offset;
+ QEDRequest *request;
+} QEDReadL2TableCB;
+
+static void qed_read_l2_table_cb(void *opaque, int ret)
+{
+ QEDReadL2TableCB *read_l2_table_cb = opaque;
+ QEDRequest *request = read_l2_table_cb->request;
+ BDRVQEDState *s = read_l2_table_cb->s;
+ CachedL2Table *l2_table = request->l2_table;
+ uint64_t l2_offset = read_l2_table_cb->l2_offset;
+
+ if (ret) {
+ /* can't trust loaded L2 table anymore */
+ qed_unref_l2_cache_entry(l2_table);
+ request->l2_table = NULL;
+ } else {
+ l2_table->offset = l2_offset;
+
+ qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
+
+ /* This is guaranteed to succeed because we just committed the entry
+ * to the cache.
+ */
+ request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+ assert(request->l2_table != NULL);
+ }
+
+ gencb_complete(&read_l2_table_cb->gencb, ret);
+}
+
+void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ QEDReadL2TableCB *read_l2_table_cb;
+
+ qed_unref_l2_cache_entry(request->l2_table);
+
+ /* Check for cached L2 entry */
+ request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
+ if (request->l2_table) {
+ cb(opaque, 0);
+ return;
+ }
+
+ request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
+ request->l2_table->table = qed_alloc_table(s);
+
+ read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque);
+ read_l2_table_cb->s = s;
+ read_l2_table_cb->l2_offset = offset;
+ read_l2_table_cb->request = request;
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD);
+ qed_read_table(s, offset, request->l2_table->table,
+ qed_read_l2_table_cb, read_l2_table_cb);
+}
+
+int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
+{
+ int ret = -EINPROGRESS;
+
+ qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
+
+void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
+ qed_write_table(s, request->l2_table->offset,
+ request->l2_table->table, index, n, flush, cb, opaque);
+}
+
+int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush)
+{
+ int ret = -EINPROGRESS;
+
+ qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
+ while (ret == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return ret;
+}
diff --git a/contrib/qemu/block/qed.c b/contrib/qemu/block/qed.c
new file mode 100644
index 000000000..f767b0528
--- /dev/null
+++ b/contrib/qemu/block/qed.c
@@ -0,0 +1,1596 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/timer.h"
+#include "trace.h"
+#include "qed.h"
+#include "qapi/qmp/qerror.h"
+#include "migration/migration.h"
+
+static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ QEDAIOCB *acb = (QEDAIOCB *)blockacb;
+ bool finished = false;
+
+ /* Wait for the request to finish */
+ acb->finished = &finished;
+ while (!finished) {
+ qemu_aio_wait();
+ }
+}
+
+static const AIOCBInfo qed_aiocb_info = {
+ .aiocb_size = sizeof(QEDAIOCB),
+ .cancel = qed_aio_cancel,
+};
+
+static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
+ const char *filename)
+{
+ const QEDHeader *header = (const QEDHeader *)buf;
+
+ if (buf_size < sizeof(*header)) {
+ return 0;
+ }
+ if (le32_to_cpu(header->magic) != QED_MAGIC) {
+ return 0;
+ }
+ return 100;
+}
+
+/**
+ * Check whether an image format is raw
+ *
+ * @fmt: Backing file format, may be NULL
+ */
+static bool qed_fmt_is_raw(const char *fmt)
+{
+ return fmt && strcmp(fmt, "raw") == 0;
+}
+
+static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
+{
+ cpu->magic = le32_to_cpu(le->magic);
+ cpu->cluster_size = le32_to_cpu(le->cluster_size);
+ cpu->table_size = le32_to_cpu(le->table_size);
+ cpu->header_size = le32_to_cpu(le->header_size);
+ cpu->features = le64_to_cpu(le->features);
+ cpu->compat_features = le64_to_cpu(le->compat_features);
+ cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
+ cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
+ cpu->image_size = le64_to_cpu(le->image_size);
+ cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
+ cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
+}
+
+static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
+{
+ le->magic = cpu_to_le32(cpu->magic);
+ le->cluster_size = cpu_to_le32(cpu->cluster_size);
+ le->table_size = cpu_to_le32(cpu->table_size);
+ le->header_size = cpu_to_le32(cpu->header_size);
+ le->features = cpu_to_le64(cpu->features);
+ le->compat_features = cpu_to_le64(cpu->compat_features);
+ le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
+ le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
+ le->image_size = cpu_to_le64(cpu->image_size);
+ le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
+ le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
+}
+
+int qed_write_header_sync(BDRVQEDState *s)
+{
+ QEDHeader le;
+ int ret;
+
+ qed_header_cpu_to_le(&s->header, &le);
+ ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
+ if (ret != sizeof(le)) {
+ return ret;
+ }
+ return 0;
+}
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ int nsectors;
+ uint8_t *buf;
+} QEDWriteHeaderCB;
+
+static void qed_write_header_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+
+ qemu_vfree(write_header_cb->buf);
+ gencb_complete(write_header_cb, ret);
+}
+
+static void qed_write_header_read_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+ BDRVQEDState *s = write_header_cb->s;
+
+ if (ret) {
+ qed_write_header_cb(write_header_cb, ret);
+ return;
+ }
+
+ /* Update header */
+ qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
+
+ bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
+ write_header_cb->nsectors, qed_write_header_cb,
+ write_header_cb);
+}
+
+/**
+ * Update header in-place (does not rewrite backing filename or other strings)
+ *
+ * This function only updates known header fields in-place and does not affect
+ * extra data after the QED header.
+ */
+static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
+ void *opaque)
+{
+ /* We must write full sectors for O_DIRECT but cannot necessarily generate
+ * the data following the header if an unrecognized compat feature is
+ * active. Therefore, first read the sectors containing the header, update
+ * them, and write back.
+ */
+
+ int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
+ BDRV_SECTOR_SIZE;
+ size_t len = nsectors * BDRV_SECTOR_SIZE;
+ QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
+ cb, opaque);
+
+ write_header_cb->s = s;
+ write_header_cb->nsectors = nsectors;
+ write_header_cb->buf = qemu_blockalign(s->bs, len);
+ write_header_cb->iov.iov_base = write_header_cb->buf;
+ write_header_cb->iov.iov_len = len;
+ qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
+
+ bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
+ qed_write_header_read_cb, write_header_cb);
+}
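+
+/* Editor's note (not part of the original QEMU code): a worked example of the
+ * sector round-up above. The QED header typically packs to 64 bytes (six
+ * 32-bit plus five 64-bit fields), so with a 512-byte BDRV_SECTOR_SIZE:
+ *
+ * nsectors = (64 + 512 - 1) / 512 = 1
+ *
+ * i.e. the read-modify-write cycle touches a single sector, which is read
+ * back in full before the header fields are rewritten.
+ */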
+
+static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
+{
+ uint64_t table_entries;
+ uint64_t l2_size;
+
+ table_entries = (table_size * cluster_size) / sizeof(uint64_t);
+ l2_size = table_entries * cluster_size;
+
+ return l2_size * table_entries;
+}
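+
+/* Editor's note (not part of the original QEMU code): with the default
+ * geometry (cluster_size = 64 KB, table_size = 4) the formula above gives
+ *
+ * table_entries = (4 * 65536) / 8 = 32768
+ * l2_size = 32768 * 65536 = 2 GB addressed per L2 table
+ * max image size = 2 GB * 32768 L1 entries = 64 TB
+ */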
+
+static bool qed_is_cluster_size_valid(uint32_t cluster_size)
+{
+ if (cluster_size < QED_MIN_CLUSTER_SIZE ||
+ cluster_size > QED_MAX_CLUSTER_SIZE) {
+ return false;
+ }
+ if (cluster_size & (cluster_size - 1)) {
+ return false; /* not power of 2 */
+ }
+ return true;
+}
+
+static bool qed_is_table_size_valid(uint32_t table_size)
+{
+ if (table_size < QED_MIN_TABLE_SIZE ||
+ table_size > QED_MAX_TABLE_SIZE) {
+ return false;
+ }
+ if (table_size & (table_size - 1)) {
+ return false; /* not power of 2 */
+ }
+ return true;
+}
+
+static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
+ uint32_t table_size)
+{
+ if (image_size % BDRV_SECTOR_SIZE != 0) {
+ return false; /* not multiple of sector size */
+ }
+ if (image_size > qed_max_image_size(cluster_size, table_size)) {
+ return false; /* image is too large */
+ }
+ return true;
+}
+
+/**
+ * Read a string of known length from the image file
+ *
+ * @file: Image file
+ * @offset: File offset to start of string, in bytes
+ * @n: String length in bytes
+ * @buf: Destination buffer
+ * @buflen: Destination buffer length in bytes
+ * @ret: 0 on success, -errno on failure
+ *
+ * The string is NUL-terminated.
+ */
+static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
+ char *buf, size_t buflen)
+{
+ int ret;
+ if (n >= buflen) {
+ return -EINVAL;
+ }
+ ret = bdrv_pread(file, offset, buf, n);
+ if (ret < 0) {
+ return ret;
+ }
+ buf[n] = '\0';
+ return 0;
+}
+
+/**
+ * Allocate new clusters
+ *
+ * @s: QED state
+ * @n: Number of contiguous clusters to allocate
+ * @ret: Offset of first allocated cluster
+ *
+ * This function only produces the offset where the new clusters should be
+ * written. It updates BDRVQEDState but does not make any changes to the image
+ * file.
+ */
+static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
+{
+ uint64_t offset = s->file_size;
+ s->file_size += n * s->header.cluster_size;
+ return offset;
+}
+
+QEDTable *qed_alloc_table(BDRVQEDState *s)
+{
+ /* Honor O_DIRECT memory alignment requirements */
+ return qemu_blockalign(s->bs,
+ s->header.cluster_size * s->header.table_size);
+}
+
+/**
+ * Allocate a new zeroed L2 table
+ */
+static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
+{
+ CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
+
+ l2_table->table = qed_alloc_table(s);
+ l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
+
+ memset(l2_table->table->offsets, 0,
+ s->header.cluster_size * s->header.table_size);
+ return l2_table;
+}
+
+static void qed_aio_next_io(void *opaque, int ret);
+
+static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
+{
+ assert(!s->allocating_write_reqs_plugged);
+
+ s->allocating_write_reqs_plugged = true;
+}
+
+static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
+{
+ QEDAIOCB *acb;
+
+ assert(s->allocating_write_reqs_plugged);
+
+ s->allocating_write_reqs_plugged = false;
+
+ acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+ if (acb) {
+ qed_aio_next_io(acb, 0);
+ }
+}
+
+static void qed_finish_clear_need_check(void *opaque, int ret)
+{
+ /* Do nothing */
+}
+
+static void qed_flush_after_clear_need_check(void *opaque, int ret)
+{
+ BDRVQEDState *s = opaque;
+
+ bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
+
+ /* No need to wait until flush completes */
+ qed_unplug_allocating_write_reqs(s);
+}
+
+static void qed_clear_need_check(void *opaque, int ret)
+{
+ BDRVQEDState *s = opaque;
+
+ if (ret) {
+ qed_unplug_allocating_write_reqs(s);
+ return;
+ }
+
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header(s, qed_flush_after_clear_need_check, s);
+}
+
+static void qed_need_check_timer_cb(void *opaque)
+{
+ BDRVQEDState *s = opaque;
+
+ /* The timer should only fire when allocating writes have drained */
+ assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
+
+ trace_qed_need_check_timer_cb(s);
+
+ qed_plug_allocating_write_reqs(s);
+
+ /* Ensure writes are on disk before clearing flag */
+ bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+}
+
+static void qed_start_need_check_timer(BDRVQEDState *s)
+{
+ trace_qed_start_need_check_timer(s);
+
+ /* Use vm_clock so we don't alter the image file while suspended for
+ * migration.
+ */
+ qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) +
+ get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT);
+}
+
+/* It's okay to call this multiple times or when no timer is started */
+static void qed_cancel_need_check_timer(BDRVQEDState *s)
+{
+ trace_qed_cancel_need_check_timer(s);
+ qemu_del_timer(s->need_check_timer);
+}
+
+static void bdrv_qed_rebind(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+ s->bs = bs;
+}
+
+static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags)
+{
+ BDRVQEDState *s = bs->opaque;
+ QEDHeader le_header;
+ int64_t file_size;
+ int ret;
+
+ s->bs = bs;
+ QSIMPLEQ_INIT(&s->allocating_write_reqs);
+
+ ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
+ if (ret < 0) {
+ return ret;
+ }
+ qed_header_le_to_cpu(&le_header, &s->header);
+
+ if (s->header.magic != QED_MAGIC) {
+ return -EMEDIUMTYPE;
+ }
+ if (s->header.features & ~QED_FEATURE_MASK) {
+ /* image uses unsupported feature bits */
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%" PRIx64,
+ s->header.features & ~QED_FEATURE_MASK);
+ qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bs->device_name, "QED", buf);
+ return -ENOTSUP;
+ }
+ if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
+ return -EINVAL;
+ }
+
+ /* Round down file size to the last cluster */
+ file_size = bdrv_getlength(bs->file);
+ if (file_size < 0) {
+ return file_size;
+ }
+ s->file_size = qed_start_of_cluster(s, file_size);
+
+ if (!qed_is_table_size_valid(s->header.table_size)) {
+ return -EINVAL;
+ }
+ if (!qed_is_image_size_valid(s->header.image_size,
+ s->header.cluster_size,
+ s->header.table_size)) {
+ return -EINVAL;
+ }
+ if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
+ return -EINVAL;
+ }
+
+ s->table_nelems = (s->header.cluster_size * s->header.table_size) /
+ sizeof(uint64_t);
+ s->l2_shift = ffs(s->header.cluster_size) - 1;
+ s->l2_mask = s->table_nelems - 1;
+ s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
+
+ if ((s->header.features & QED_F_BACKING_FILE)) {
+ if ((uint64_t)s->header.backing_filename_offset +
+ s->header.backing_filename_size >
+ s->header.cluster_size * s->header.header_size) {
+ return -EINVAL;
+ }
+
+ ret = qed_read_string(bs->file, s->header.backing_filename_offset,
+ s->header.backing_filename_size, bs->backing_file,
+ sizeof(bs->backing_file));
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
+ pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
+ }
+ }
+
+ /* Reset unknown autoclear feature bits. This is a backwards
+ * compatibility mechanism that allows images to be opened by older
+ * programs, which "knock out" unknown feature bits. When an image is
+ * opened by a newer program again it can detect that the autoclear
+ * feature is no longer valid.
+ */
+ if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
+ !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
+ s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
+
+ ret = qed_write_header_sync(s);
+ if (ret) {
+ return ret;
+ }
+
+ /* From here on only known autoclear feature bits are valid */
+ bdrv_flush(bs->file);
+ }
+
+ s->l1_table = qed_alloc_table(s);
+ qed_init_l2_cache(&s->l2_cache);
+
+ ret = qed_read_l1_table_sync(s);
+ if (ret) {
+ goto out;
+ }
+
+ /* If image was not closed cleanly, check consistency */
+ if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
+ /* Read-only images cannot be fixed. There is no risk of corruption
+ * since write operations are not possible. Therefore, allow
+ * potentially inconsistent images to be opened read-only. This can
+ * aid data recovery from an otherwise inconsistent image.
+ */
+ if (!bdrv_is_read_only(bs->file) &&
+ !(flags & BDRV_O_INCOMING)) {
+ BdrvCheckResult result = {0};
+
+ ret = qed_check(s, &result, true);
+ if (ret) {
+ goto out;
+ }
+ }
+ }
+
+ s->need_check_timer = qemu_new_timer_ns(vm_clock,
+ qed_need_check_timer_cb, s);
+
+out:
+ if (ret) {
+ qed_free_l2_cache(&s->l2_cache);
+ qemu_vfree(s->l1_table);
+ }
+ return ret;
+}
+
+/* We have nothing to do for QED reopen; the stub just returns
+ * success */
+static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
+static void bdrv_qed_close(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ qed_cancel_need_check_timer(s);
+ qemu_free_timer(s->need_check_timer);
+
+ /* Ensure writes reach stable storage */
+ bdrv_flush(bs->file);
+
+ /* Clean shutdown, no check required on next open */
+ if (s->header.features & QED_F_NEED_CHECK) {
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+ }
+
+ qed_free_l2_cache(&s->l2_cache);
+ qemu_vfree(s->l1_table);
+}
+
+static int qed_create(const char *filename, uint32_t cluster_size,
+ uint64_t image_size, uint32_t table_size,
+ const char *backing_file, const char *backing_fmt)
+{
+ QEDHeader header = {
+ .magic = QED_MAGIC,
+ .cluster_size = cluster_size,
+ .table_size = table_size,
+ .header_size = 1,
+ .features = 0,
+ .compat_features = 0,
+ .l1_table_offset = cluster_size,
+ .image_size = image_size,
+ };
+ QEDHeader le_header;
+ uint8_t *l1_table = NULL;
+ size_t l1_size = header.cluster_size * header.table_size;
+ int ret = 0;
+ BlockDriverState *bs = NULL;
+
+ ret = bdrv_create_file(filename, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* File must start empty and grow; check that truncate is supported */
+ ret = bdrv_truncate(bs, 0);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (backing_file) {
+ header.features |= QED_F_BACKING_FILE;
+ header.backing_filename_offset = sizeof(le_header);
+ header.backing_filename_size = strlen(backing_file);
+
+ if (qed_fmt_is_raw(backing_fmt)) {
+ header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
+ }
+ }
+
+ qed_header_cpu_to_le(&header, &le_header);
+ ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
+ if (ret < 0) {
+ goto out;
+ }
+ ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
+ header.backing_filename_size);
+ if (ret < 0) {
+ goto out;
+ }
+
+ l1_table = g_malloc0(l1_size);
+ ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = 0; /* success */
+out:
+ g_free(l1_table);
+ bdrv_delete(bs);
+ return ret;
+}
+
+static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
+{
+ uint64_t image_size = 0;
+ uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
+ uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
+ const char *backing_file = NULL;
+ const char *backing_fmt = NULL;
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ image_size = options->value.n;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ backing_file = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+ backing_fmt = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+ if (options->value.n) {
+ cluster_size = options->value.n;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) {
+ if (options->value.n) {
+ table_size = options->value.n;
+ }
+ }
+ options++;
+ }
+
+ if (!qed_is_cluster_size_valid(cluster_size)) {
+ fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n",
+ QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
+ return -EINVAL;
+ }
+ if (!qed_is_table_size_valid(table_size)) {
+ fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n",
+ QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
+ return -EINVAL;
+ }
+ if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
+ fprintf(stderr, "QED image size must be a non-zero multiple of "
+ "cluster size and less than %" PRIu64 " bytes\n",
+ qed_max_image_size(cluster_size, table_size));
+ return -EINVAL;
+ }
+
+ return qed_create(filename, cluster_size, image_size, table_size,
+ backing_file, backing_fmt);
+}
+
+typedef struct {
+ Coroutine *co;
+ int is_allocated;
+ int *pnum;
+} QEDIsAllocatedCB;
+
+static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
+{
+ QEDIsAllocatedCB *cb = opaque;
+ *cb->pnum = len / BDRV_SECTOR_SIZE;
+ cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
+}
+
+static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ BDRVQEDState *s = bs->opaque;
+ uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
+ size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
+ QEDIsAllocatedCB cb = {
+ .is_allocated = -1,
+ .pnum = pnum,
+ };
+ QEDRequest request = { .l2_table = NULL };
+
+ qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
+
+ /* Now sleep if the callback wasn't invoked immediately */
+ while (cb.is_allocated == -1) {
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
+ }
+
+ qed_unref_l2_cache_entry(request.l2_table);
+
+ return cb.is_allocated;
+}
+
+static int bdrv_qed_make_empty(BlockDriverState *bs)
+{
+ return -ENOTSUP;
+}
+
+static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
+{
+ return acb->common.bs->opaque;
+}
+
+/**
+ * Read from the backing file or zero-fill if no backing file
+ *
+ * @s: QED state
+ * @pos: Byte position in device
+ * @qiov: Destination I/O vector
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ *
+ * This function reads qiov->size bytes starting at pos from the backing file.
+ * If there is no backing file then zeroes are read.
+ */
+static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
+ QEMUIOVector *qiov,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ uint64_t backing_length = 0;
+ size_t size;
+
+ /* If there is a backing file, get its length. Treat the absence of a
+ * backing file like a zero length backing file.
+ */
+ if (s->bs->backing_hd) {
+ int64_t l = bdrv_getlength(s->bs->backing_hd);
+ if (l < 0) {
+ cb(opaque, l);
+ return;
+ }
+ backing_length = l;
+ }
+
+ /* Zero all sectors if reading beyond the end of the backing file */
+ if (pos >= backing_length ||
+ pos + qiov->size > backing_length) {
+ qemu_iovec_memset(qiov, 0, 0, qiov->size);
+ }
+
+ /* Complete now if there are no backing file sectors to read */
+ if (pos >= backing_length) {
+ cb(opaque, 0);
+ return;
+ }
+
+ /* If the read straddles the end of the backing file, shorten it */
+ size = MIN((uint64_t)backing_length - pos, qiov->size);
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
+ bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
+ qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
+}
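+
+/* Editor's note (not part of the original QEMU code): a worked example of the
+ * straddling case above. With pos = 1024 KB, qiov->size = 128 KB and a
+ * 1088 KB backing file, the whole qiov is zeroed first, size is clamped to
+ * MIN(1088 KB - 1024 KB, 128 KB) = 64 KB, and only those 64 KB are read from
+ * the backing file; the rest of the buffer stays zeroed.
+ */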
+
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ uint64_t offset;
+} CopyFromBackingFileCB;
+
+static void qed_copy_from_backing_file_cb(void *opaque, int ret)
+{
+ CopyFromBackingFileCB *copy_cb = opaque;
+ qemu_vfree(copy_cb->iov.iov_base);
+ gencb_complete(&copy_cb->gencb, ret);
+}
+
+static void qed_copy_from_backing_file_write(void *opaque, int ret)
+{
+ CopyFromBackingFileCB *copy_cb = opaque;
+ BDRVQEDState *s = copy_cb->s;
+
+ if (ret) {
+ qed_copy_from_backing_file_cb(copy_cb, ret);
+ return;
+ }
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
+ bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
+ &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
+ qed_copy_from_backing_file_cb, copy_cb);
+}
+
+/**
+ * Copy data from backing file into the image
+ *
+ * @s: QED state
+ * @pos: Byte position in device
+ * @len: Number of bytes
+ * @offset: Byte offset in image file
+ * @cb: Completion function
+ * @opaque: User data for completion function
+ */
+static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
+ uint64_t len, uint64_t offset,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ CopyFromBackingFileCB *copy_cb;
+
+ /* Skip copy entirely if there is no work to do */
+ if (len == 0) {
+ cb(opaque, 0);
+ return;
+ }
+
+ copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
+ copy_cb->s = s;
+ copy_cb->offset = offset;
+ copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
+ copy_cb->iov.iov_len = len;
+ qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
+
+ qed_read_backing_file(s, pos, &copy_cb->qiov,
+ qed_copy_from_backing_file_write, copy_cb);
+}
+
+/**
+ * Link one or more contiguous clusters into a table
+ *
+ * @s: QED state
+ * @table: L2 table
+ * @index: First cluster index
+ * @n: Number of contiguous clusters
+ * @cluster: First cluster offset
+ *
+ * The cluster offset may be an allocated byte offset in the image file, the
+ * zero cluster marker, or the unallocated cluster marker.
+ */
+static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
+ unsigned int n, uint64_t cluster)
+{
+ int i;
+ for (i = index; i < index + n; i++) {
+ table->offsets[i] = cluster;
+ if (!qed_offset_is_unalloc_cluster(cluster) &&
+ !qed_offset_is_zero_cluster(cluster)) {
+ cluster += s->header.cluster_size;
+ }
+ }
+}
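+
+/* Editor's note (not part of the original QEMU code): linking n = 3 clusters
+ * at cluster = 0x100000 with 64 KB clusters stores 0x100000, 0x110000 and
+ * 0x120000 in consecutive entries, while the special markers (0 for
+ * unallocated, 1 for zero clusters) are stored unchanged in every entry
+ * because they are not real file offsets.
+ */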
+
+static void qed_aio_complete_bh(void *opaque)
+{
+ QEDAIOCB *acb = opaque;
+ BlockDriverCompletionFunc *cb = acb->common.cb;
+ void *user_opaque = acb->common.opaque;
+ int ret = acb->bh_ret;
+ bool *finished = acb->finished;
+
+ qemu_bh_delete(acb->bh);
+ qemu_aio_release(acb);
+
+ /* Invoke callback */
+ cb(user_opaque, ret);
+
+ /* Signal cancel completion */
+ if (finished) {
+ *finished = true;
+ }
+}
+
+static void qed_aio_complete(QEDAIOCB *acb, int ret)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+
+ trace_qed_aio_complete(s, acb, ret);
+
+ /* Free resources */
+ qemu_iovec_destroy(&acb->cur_qiov);
+ qed_unref_l2_cache_entry(acb->request.l2_table);
+
+ /* Free the buffer we may have allocated for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ qemu_vfree(acb->qiov->iov[0].iov_base);
+ acb->qiov->iov[0].iov_base = NULL;
+ }
+
+ /* Arrange for a bh to invoke the completion function */
+ acb->bh_ret = ret;
+ acb->bh = qemu_bh_new(qed_aio_complete_bh, acb);
+ qemu_bh_schedule(acb->bh);
+
+ /* Start next allocating write request waiting behind this one. Note that
+ * requests enqueue themselves when they first hit an unallocated cluster
+ * but they wait until the entire request is finished before waking up the
+ * next request in the queue. This ensures that we don't cycle through
+ * requests multiple times but rather finish one at a time completely.
+ */
+ if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
+ QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
+ acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+ if (acb) {
+ qed_aio_next_io(acb, 0);
+ } else if (s->header.features & QED_F_NEED_CHECK) {
+ qed_start_need_check_timer(s);
+ }
+ }
+}
+
+/**
+ * Commit the current L2 table to the cache
+ */
+static void qed_commit_l2_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ CachedL2Table *l2_table = acb->request.l2_table;
+ uint64_t l2_offset = l2_table->offset;
+
+ qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
+
+ /* This is guaranteed to succeed because we just committed the entry to the
+ * cache.
+ */
+ acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
+ assert(acb->request.l2_table != NULL);
+
+ qed_aio_next_io(opaque, ret);
+}
+
+/**
+ * Update L1 table with new L2 table offset and write it out
+ */
+static void qed_aio_write_l1_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ int index;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ index = qed_l1_index(s, acb->cur_pos);
+ s->l1_table->offsets[index] = acb->request.l2_table->offset;
+
+ qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
+}
+
+/**
+ * Update L2 table with new cluster offsets and write them out
+ */
+static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
+ int index;
+
+ if (ret) {
+ goto err;
+ }
+
+ if (need_alloc) {
+ qed_unref_l2_cache_entry(acb->request.l2_table);
+ acb->request.l2_table = qed_new_l2_table(s);
+ }
+
+ index = qed_l2_index(s, acb->cur_pos);
+ qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
+ offset);
+
+ if (need_alloc) {
+ /* Write out the whole new L2 table */
+ qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
+ qed_aio_write_l1_update, acb);
+ } else {
+ /* Write out only the updated part of the L2 table */
+ qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
+ qed_aio_next_io, acb);
+ }
+ return;
+
+err:
+ qed_aio_complete(acb, ret);
+}
+
+static void qed_aio_write_l2_update_cb(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+}
+
+/**
+ * Flush new data clusters before updating the L2 table
+ *
+ * This flush is necessary when a backing file is in use. A crash during an
+ * allocating write could result in empty clusters in the image. If the write
+ * only touched a subregion of the cluster, then backing image sectors have
+ * been lost in the untouched region. The solution is to flush after writing a
+ * new data cluster and before updating the L2 table.
+ */
+static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+
+ if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
+ qed_aio_complete(acb, -EIO);
+ }
+}
+
+/**
+ * Write data to the image file
+ */
+static void qed_aio_write_main(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t offset = acb->cur_cluster +
+ qed_offset_into_cluster(s, acb->cur_pos);
+ BlockDriverCompletionFunc *next_fn;
+
+ trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
+ next_fn = qed_aio_next_io;
+ } else {
+ if (s->bs->backing_hd) {
+ next_fn = qed_aio_write_flush_before_l2_update;
+ } else {
+ next_fn = qed_aio_write_l2_update_cb;
+ }
+ }
+
+ BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
+ bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ next_fn, acb);
+}
+
+/**
+ * Populate the untouched region after the write in a new data cluster
+ */
+static void qed_aio_write_postfill(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t start = acb->cur_pos + acb->cur_qiov.size;
+ uint64_t len =
+ qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
+ uint64_t offset = acb->cur_cluster +
+ qed_offset_into_cluster(s, acb->cur_pos) +
+ acb->cur_qiov.size;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ trace_qed_aio_write_postfill(s, acb, start, len, offset);
+ qed_copy_from_backing_file(s, start, len, offset,
+ qed_aio_write_main, acb);
+}
+
+/**
+ * Populate the untouched region before the write in a new data cluster
+ */
+static void qed_aio_write_prefill(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
+ uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
+
+ trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
+ qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
+ qed_aio_write_postfill, acb);
+}
+
+/**
+ * Check if the QED_F_NEED_CHECK bit should be set during allocating write
+ */
+static bool qed_should_set_need_check(BDRVQEDState *s)
+{
+ /* The flush before L2 update path ensures consistency */
+ if (s->bs->backing_hd) {
+ return false;
+ }
+
+ return !(s->header.features & QED_F_NEED_CHECK);
+}
+
+static void qed_aio_write_zero_cluster(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ qed_aio_write_l2_update(acb, 0, 1);
+}
+
+/**
+ * Write new data cluster
+ *
+ * @acb: Write request
+ * @len: Length in bytes
+ *
+ * This path is taken when writing to previously unallocated clusters.
+ */
+static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverCompletionFunc *cb;
+
+ /* Cancel timer when the first allocating request comes in */
+ if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
+ qed_cancel_need_check_timer(s);
+ }
+
+ /* Freeze this request if another allocating write is in progress */
+ if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
+ QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
+ }
+ if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
+ s->allocating_write_reqs_plugged) {
+ return; /* wait for existing request to finish */
+ }
+
+ acb->cur_nclusters = qed_bytes_to_clusters(s,
+ qed_offset_into_cluster(s, acb->cur_pos) + len);
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ if (acb->flags & QED_AIOCB_ZERO) {
+ /* Skip ahead if the clusters are already zero */
+ if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
+ qed_aio_next_io(acb, 0);
+ return;
+ }
+
+ cb = qed_aio_write_zero_cluster;
+ } else {
+ cb = qed_aio_write_prefill;
+ acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
+ }
+
+ if (qed_should_set_need_check(s)) {
+ s->header.features |= QED_F_NEED_CHECK;
+ qed_write_header(s, cb, acb);
+ } else {
+ cb(acb, 0);
+ }
+}
+
+/**
+ * Write data cluster in place
+ *
+ * @acb: Write request
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * This path is taken when writing to already allocated clusters.
+ */
+static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
+{
+ /* Allocate buffer for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ struct iovec *iov = acb->qiov->iov;
+
+ if (!iov->iov_base) {
+ iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
+ memset(iov->iov_base, 0, iov->iov_len);
+ }
+ }
+
+ /* Calculate the I/O vector */
+ acb->cur_cluster = offset;
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ /* Do the actual write */
+ qed_aio_write_main(acb, 0);
+}
+
+/**
+ * Write data cluster
+ *
+ * @opaque: Write request
+ * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
+ * or -errno
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * Callback from qed_find_cluster().
+ */
+static void qed_aio_write_data(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ QEDAIOCB *acb = opaque;
+
+ trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
+
+ acb->find_cluster_ret = ret;
+
+ switch (ret) {
+ case QED_CLUSTER_FOUND:
+ qed_aio_write_inplace(acb, offset, len);
+ break;
+
+ case QED_CLUSTER_L2:
+ case QED_CLUSTER_L1:
+ case QED_CLUSTER_ZERO:
+ qed_aio_write_alloc(acb, len);
+ break;
+
+ default:
+ qed_aio_complete(acb, ret);
+ break;
+ }
+}
+
+/**
+ * Read data cluster
+ *
+ * @opaque: Read request
+ * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
+ * or -errno
+ * @offset: Cluster offset in bytes
+ * @len: Length in bytes
+ *
+ * Callback from qed_find_cluster().
+ */
+static void qed_aio_read_data(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverState *bs = acb->common.bs;
+
+ /* Adjust offset into cluster */
+ offset += qed_offset_into_cluster(s, acb->cur_pos);
+
+ trace_qed_aio_read_data(s, acb, ret, offset, len);
+
+ if (ret < 0) {
+ goto err;
+ }
+
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ /* Handle zero cluster and backing file reads */
+ if (ret == QED_CLUSTER_ZERO) {
+ qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
+ qed_aio_next_io(acb, 0);
+ return;
+ } else if (ret != QED_CLUSTER_FOUND) {
+ qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
+ qed_aio_next_io, acb);
+ return;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ qed_aio_next_io, acb);
+ return;
+
+err:
+ qed_aio_complete(acb, ret);
+}
+
+/**
+ * Begin next I/O or complete the request
+ */
+static void qed_aio_next_io(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ BDRVQEDState *s = acb_to_s(acb);
+ QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
+ qed_aio_write_data : qed_aio_read_data;
+
+ trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
+
+ /* Handle I/O error */
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ acb->qiov_offset += acb->cur_qiov.size;
+ acb->cur_pos += acb->cur_qiov.size;
+ qemu_iovec_reset(&acb->cur_qiov);
+
+ /* Complete request */
+ if (acb->cur_pos >= acb->end_pos) {
+ qed_aio_complete(acb, 0);
+ return;
+ }
+
+ /* Find next cluster and start I/O */
+ qed_find_cluster(s, &acb->request,
+ acb->cur_pos, acb->end_pos - acb->cur_pos,
+ io_fn, acb);
+}
+
+static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, int flags)
+{
+ QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
+
+ trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
+ opaque, flags);
+
+ acb->flags = flags;
+ acb->finished = NULL;
+ acb->qiov = qiov;
+ acb->qiov_offset = 0;
+ acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
+ acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
+ acb->request.l2_table = NULL;
+ qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+
+ /* Start request */
+ qed_aio_next_io(acb, 0);
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QED_AIOCB_WRITE);
+}
+
+typedef struct {
+ Coroutine *co;
+ int ret;
+ bool done;
+} QEDWriteZeroesCB;
+
+static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
+{
+ QEDWriteZeroesCB *cb = opaque;
+
+ cb->done = true;
+ cb->ret = ret;
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
+}
+
+static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors)
+{
+ BlockDriverAIOCB *blockacb;
+ BDRVQEDState *s = bs->opaque;
+ QEDWriteZeroesCB cb = { .done = false };
+ QEMUIOVector qiov;
+ struct iovec iov;
+
+ /* Refuse if there are untouched backing file sectors */
+ if (bs->backing_hd) {
+ if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ }
+
+ /* Zero writes start without an I/O buffer. If a buffer becomes necessary
+ * then it will be allocated during request processing.
+ */
+ iov.iov_base = NULL;
+ iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
+ qed_co_write_zeroes_cb, &cb,
+ QED_AIOCB_WRITE | QED_AIOCB_ZERO);
+ if (!blockacb) {
+ return -EIO;
+ }
+ if (!cb.done) {
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
+ }
+ assert(cb.done);
+ return cb.ret;
+}
+
+static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
+{
+ BDRVQEDState *s = bs->opaque;
+ uint64_t old_image_size;
+ int ret;
+
+ if (!qed_is_image_size_valid(offset, s->header.cluster_size,
+ s->header.table_size)) {
+ return -EINVAL;
+ }
+
+ /* Shrinking is currently not supported */
+ if ((uint64_t)offset < s->header.image_size) {
+ return -ENOTSUP;
+ }
+
+ old_image_size = s->header.image_size;
+ s->header.image_size = offset;
+ ret = qed_write_header_sync(s);
+ if (ret < 0) {
+ s->header.image_size = old_image_size;
+ }
+ return ret;
+}
+
+static int64_t bdrv_qed_getlength(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+ return s->header.image_size;
+}
+
+static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ memset(bdi, 0, sizeof(*bdi));
+ bdi->cluster_size = s->header.cluster_size;
+ bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
+ return 0;
+}
+
+static int bdrv_qed_change_backing_file(BlockDriverState *bs,
+ const char *backing_file,
+ const char *backing_fmt)
+{
+ BDRVQEDState *s = bs->opaque;
+ QEDHeader new_header, le_header;
+ void *buffer;
+ size_t buffer_len, backing_file_len;
+ int ret;
+
+ /* Refuse to set backing filename if unknown compat feature bits are
+ * active. If the image uses an unknown compat feature then we may not
+ * know the layout of data following the header structure and cannot safely
+ * add a new string.
+ */
+ if (backing_file && (s->header.compat_features &
+ ~QED_COMPAT_FEATURE_MASK)) {
+ return -ENOTSUP;
+ }
+
+ memcpy(&new_header, &s->header, sizeof(new_header));
+
+ new_header.features &= ~(QED_F_BACKING_FILE |
+ QED_F_BACKING_FORMAT_NO_PROBE);
+
+ /* Adjust feature flags */
+ if (backing_file) {
+ new_header.features |= QED_F_BACKING_FILE;
+
+ if (qed_fmt_is_raw(backing_fmt)) {
+ new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
+ }
+ }
+
+ /* Calculate new header size */
+ backing_file_len = 0;
+
+ if (backing_file) {
+ backing_file_len = strlen(backing_file);
+ }
+
+ buffer_len = sizeof(new_header);
+ new_header.backing_filename_offset = buffer_len;
+ new_header.backing_filename_size = backing_file_len;
+ buffer_len += backing_file_len;
+
+ /* Make sure we can rewrite header without failing */
+ if (buffer_len > new_header.header_size * new_header.cluster_size) {
+ return -ENOSPC;
+ }
+
+ /* Prepare new header */
+ buffer = g_malloc(buffer_len);
+
+ qed_header_cpu_to_le(&new_header, &le_header);
+ memcpy(buffer, &le_header, sizeof(le_header));
+ buffer_len = sizeof(le_header);
+
+ if (backing_file) {
+ memcpy(buffer + buffer_len, backing_file, backing_file_len);
+ buffer_len += backing_file_len;
+ }
+
+ /* Write new header */
+ ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
+ g_free(buffer);
+ if (ret == 0) {
+ memcpy(&s->header, &new_header, sizeof(new_header));
+ }
+ return ret;
+}
+
+static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ bdrv_qed_close(bs);
+ memset(s, 0, sizeof(BDRVQEDState));
+ bdrv_qed_open(bs, NULL, bs->open_flags);
+}
+
+static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ return qed_check(s, result, !!fix);
+}
+
+static QEMUOptionParameter qed_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size (in bytes)"
+ }, {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a base image"
+ }, {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the base image"
+ }, {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = OPT_SIZE,
+ .help = "Cluster size (in bytes)",
+ .value = { .n = QED_DEFAULT_CLUSTER_SIZE },
+ }, {
+ .name = BLOCK_OPT_TABLE_SIZE,
+ .type = OPT_SIZE,
+ .help = "L1/L2 table size (in clusters)"
+ },
+ { /* end of list */ }
+};
+
+static BlockDriver bdrv_qed = {
+ .format_name = "qed",
+ .instance_size = sizeof(BDRVQEDState),
+ .create_options = qed_create_options,
+
+ .bdrv_probe = bdrv_qed_probe,
+ .bdrv_rebind = bdrv_qed_rebind,
+ .bdrv_open = bdrv_qed_open,
+ .bdrv_close = bdrv_qed_close,
+ .bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
+ .bdrv_create = bdrv_qed_create,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+ .bdrv_co_is_allocated = bdrv_qed_co_is_allocated,
+ .bdrv_make_empty = bdrv_qed_make_empty,
+ .bdrv_aio_readv = bdrv_qed_aio_readv,
+ .bdrv_aio_writev = bdrv_qed_aio_writev,
+ .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes,
+ .bdrv_truncate = bdrv_qed_truncate,
+ .bdrv_getlength = bdrv_qed_getlength,
+ .bdrv_get_info = bdrv_qed_get_info,
+ .bdrv_change_backing_file = bdrv_qed_change_backing_file,
+ .bdrv_invalidate_cache = bdrv_qed_invalidate_cache,
+ .bdrv_check = bdrv_qed_check,
+};
+
+static void bdrv_qed_init(void)
+{
+ bdrv_register(&bdrv_qed);
+}
+
+block_init(bdrv_qed_init);
diff --git a/contrib/qemu/block/qed.h b/contrib/qemu/block/qed.h
new file mode 100644
index 000000000..2b4ddedf3
--- /dev/null
+++ b/contrib/qemu/block/qed.h
@@ -0,0 +1,344 @@
+/*
+ * QEMU Enhanced Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_QED_H
+#define BLOCK_QED_H
+
+#include "block/block_int.h"
+
+/* The layout of a QED file is as follows:
+ *
+ * +--------+----------+----------+----------+-----+
+ * | header | L1 table | cluster0 | cluster1 | ... |
+ * +--------+----------+----------+----------+-----+
+ *
+ * There is a 2-level pagetable for cluster allocation:
+ *
+ * +----------+
+ * | L1 table |
+ * +----------+
+ * ,------' | '------.
+ * +----------+ | +----------+
+ * | L2 table | ... | L2 table |
+ * +----------+ +----------+
+ * ,------' | '------.
+ * +----------+ | +----------+
+ * | Data | ... | Data |
+ * +----------+ +----------+
+ *
+ * The L1 table is fixed size and always present. L2 tables are allocated on
+ * demand. The L1 table size determines the maximum possible image size; it
+ * can be influenced using the cluster_size and table_size values.
+ *
+ * All fields are little-endian on disk.
+ */
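+
+/* Editor's note (not part of the original QEMU code): with the defaults used
+ * by qed_create() (header_size = 1, cluster_size = 64 KB, table_size = 4,
+ * l1_table_offset = cluster_size), a fresh image starts out as:
+ *
+ * bytes [0, 64K) header cluster (QEDHeader plus backing filename)
+ * bytes [64K, 320K) L1 table (4 clusters, zero-filled at create time)
+ * bytes [320K, ...) L2 tables and data clusters, allocated on demand
+ */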
+
+enum {
+ QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24,
+
+ /* The image supports a backing file */
+ QED_F_BACKING_FILE = 0x01,
+
+ /* The image needs a consistency check before use */
+ QED_F_NEED_CHECK = 0x02,
+
+ /* The backing file format must not be probed, treat as raw image */
+ QED_F_BACKING_FORMAT_NO_PROBE = 0x04,
+
+ /* Feature bits must be used when the on-disk format changes */
+ QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */
+ QED_F_NEED_CHECK |
+ QED_F_BACKING_FORMAT_NO_PROBE,
+ QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */
+ QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */
+
+ /* Data is stored in groups of sectors called clusters. Cluster size must
+ * be large to avoid keeping too much metadata. I/O requests that have
+ * sub-cluster size will require read-modify-write.
+ */
+ QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */
+ QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024,
+ QED_DEFAULT_CLUSTER_SIZE = 64 * 1024,
+
+ /* Allocated clusters are tracked using a 2-level pagetable. Table size is
+ * a multiple of clusters so large maximum image sizes can be supported
+ * without jacking up the cluster size too much.
+ */
+ QED_MIN_TABLE_SIZE = 1, /* in clusters */
+ QED_MAX_TABLE_SIZE = 16,
+ QED_DEFAULT_TABLE_SIZE = 4,
+
+ /* Delay to flush and clean image after last allocating write completes */
+ QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */
+};
+
+typedef struct {
+ uint32_t magic; /* QED\0 */
+
+ uint32_t cluster_size; /* in bytes */
+ uint32_t table_size; /* for L1 and L2 tables, in clusters */
+ uint32_t header_size; /* in clusters */
+
+ uint64_t features; /* format feature bits */
+ uint64_t compat_features; /* compatible feature bits */
+ uint64_t autoclear_features; /* self-resetting feature bits */
+
+ uint64_t l1_table_offset; /* in bytes */
+ uint64_t image_size; /* total logical image size, in bytes */
+
+ /* if (features & QED_F_BACKING_FILE) */
+ uint32_t backing_filename_offset; /* in bytes from start of header */
+ uint32_t backing_filename_size; /* in bytes */
+} QEDHeader;
+
+typedef struct {
+ uint64_t offsets[0]; /* in bytes */
+} QEDTable;
+
+/* The L2 cache is a simple write-through cache for L2 structures */
+typedef struct CachedL2Table {
+ QEDTable *table;
+ uint64_t offset; /* offset=0 indicates an invalid entry */
+ QTAILQ_ENTRY(CachedL2Table) node;
+ int ref;
+} CachedL2Table;
+
+typedef struct {
+ QTAILQ_HEAD(, CachedL2Table) entries;
+ unsigned int n_entries;
+} L2TableCache;
+
+typedef struct QEDRequest {
+ CachedL2Table *l2_table;
+} QEDRequest;
+
+enum {
+ QED_AIOCB_WRITE = 0x0001, /* read or write? */
+ QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */
+};
+
+typedef struct QEDAIOCB {
+ BlockDriverAIOCB common;
+ QEMUBH *bh;
+ int bh_ret; /* final return status for completion bh */
+ QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */
+ int flags; /* QED_AIOCB_* bits ORed together */
+ bool *finished; /* signal for cancel completion */
+ uint64_t end_pos; /* request end on block device, in bytes */
+
+ /* User scatter-gather list */
+ QEMUIOVector *qiov;
+ size_t qiov_offset; /* byte count already processed */
+
+ /* Current cluster scatter-gather list */
+ QEMUIOVector cur_qiov;
+ uint64_t cur_pos; /* position on block device, in bytes */
+ uint64_t cur_cluster; /* cluster offset in image file */
+ unsigned int cur_nclusters; /* number of clusters being accessed */
+ int find_cluster_ret; /* used for L1/L2 update */
+
+ QEDRequest request;
+} QEDAIOCB;
+
+typedef struct {
+ BlockDriverState *bs; /* device */
+ uint64_t file_size; /* length of image file, in bytes */
+
+ QEDHeader header; /* always cpu-endian */
+ QEDTable *l1_table;
+ L2TableCache l2_cache; /* l2 table cache */
+ uint32_t table_nelems;
+ uint32_t l1_shift;
+ uint32_t l2_shift;
+ uint32_t l2_mask;
+
+ /* Allocating write request queue */
+ QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs;
+ bool allocating_write_reqs_plugged;
+
+ /* Periodic flush and clear need check flag */
+ QEMUTimer *need_check_timer;
+} BDRVQEDState;
+
+enum {
+ QED_CLUSTER_FOUND, /* cluster found */
+ QED_CLUSTER_ZERO, /* zero cluster found */
+ QED_CLUSTER_L2, /* cluster missing in L2 */
+ QED_CLUSTER_L1, /* cluster missing in L1 */
+};
+
+/**
+ * qed_find_cluster() completion callback
+ *
+ * @opaque: User data for completion callback
+ * @ret: QED_CLUSTER_FOUND Success
+ * QED_CLUSTER_ZERO Data cluster is a zero cluster
+ * QED_CLUSTER_L2 Data cluster unallocated in L2
+ * QED_CLUSTER_L1 L2 unallocated in L1
+ * -errno POSIX error occurred
+ * @offset: Data cluster offset
+ * @len: Contiguous bytes starting from cluster offset
+ *
+ * This function is invoked when qed_find_cluster() completes.
+ *
+ * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range
+ * in the image file.
+ *
+ * When the cluster is unallocated, ret is QED_CLUSTER_L2 or QED_CLUSTER_L1
+ * for a missing L2 table or L1 table entry, respectively, and len is the
+ * number of contiguous unallocated bytes.
+ */
+typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
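+
+/* Editor's sketch (not part of the original QEMU code): a minimal
+ * QEDFindClusterFunc that only records whether the queried range is backed by
+ * data, mirroring what qed_is_allocated_cb() in qed.c does.
+ */
+static inline void qed_example_find_cluster_cb(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ bool *allocated = opaque;
+
+ *allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
+}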
+
+/**
+ * Generic callback for chaining async callbacks
+ */
+typedef struct {
+ BlockDriverCompletionFunc *cb;
+ void *opaque;
+} GenericCB;
+
+void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque);
+void gencb_complete(void *opaque, int ret);
+
+/**
+ * Header functions
+ */
+int qed_write_header_sync(BDRVQEDState *s);
+
+/**
+ * L2 cache functions
+ */
+void qed_init_l2_cache(L2TableCache *l2_cache);
+void qed_free_l2_cache(L2TableCache *l2_cache);
+CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache);
+void qed_unref_l2_cache_entry(CachedL2Table *entry);
+CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset);
+void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);
+
+/**
+ * Table I/O functions
+ */
+int qed_read_l1_table_sync(BDRVQEDState *s);
+void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
+ BlockDriverCompletionFunc *cb, void *opaque);
+int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
+ unsigned int n);
+int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+ uint64_t offset);
+void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
+ BlockDriverCompletionFunc *cb, void *opaque);
+void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush,
+ BlockDriverCompletionFunc *cb, void *opaque);
+int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
+ unsigned int index, unsigned int n, bool flush);
+
+/**
+ * Cluster functions
+ */
+void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t len, QEDFindClusterFunc *cb, void *opaque);
+
+/**
+ * Consistency check
+ */
+int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix);
+
+QEDTable *qed_alloc_table(BDRVQEDState *s);
+
+/**
+ * Round down to the start of a cluster
+ */
+static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset)
+{
+ return offset & ~(uint64_t)(s->header.cluster_size - 1);
+}
+
+static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset)
+{
+ return offset & (s->header.cluster_size - 1);
+}
+
+static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes)
+{
+ return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) /
+ (s->header.cluster_size - 1);
+}
+
+static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos)
+{
+ return pos >> s->l1_shift;
+}
+
+static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos)
+{
+ return (pos >> s->l2_shift) & s->l2_mask;
+}
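+
+/* Editor's note (not part of the original QEMU code): with the default
+ * geometry, l2_shift = 16 (64 KB clusters), table_nelems = 32768, so
+ * l2_mask = 0x7fff and l1_shift = 31. A byte position of 0x12345678 then
+ * maps to l1_index = 0, l2_index = 0x1234 and an offset of 0x5678 into the
+ * data cluster.
+ */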
+
+/**
+ * Test if a cluster offset is valid
+ */
+static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset)
+{
+ uint64_t header_size = (uint64_t)s->header.header_size *
+ s->header.cluster_size;
+
+ if (offset & (s->header.cluster_size - 1)) {
+ return false;
+ }
+ return offset >= header_size && offset < s->file_size;
+}
+
+/**
+ * Test if a table offset is valid
+ */
+static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset)
+{
+ uint64_t end_offset = offset + (s->header.table_size - 1) *
+ s->header.cluster_size;
+
+ /* Overflow check */
+ if (end_offset <= offset) {
+ return false;
+ }
+
+ return qed_check_cluster_offset(s, offset) &&
+ qed_check_cluster_offset(s, end_offset);
+}
+
+static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s,
+ uint64_t offset)
+{
+ if (qed_offset_into_cluster(s, offset)) {
+ return false;
+ }
+ return true;
+}
+
+static inline bool qed_offset_is_unalloc_cluster(uint64_t offset)
+{
+ if (offset == 0) {
+ return true;
+ }
+ return false;
+}
+
+static inline bool qed_offset_is_zero_cluster(uint64_t offset)
+{
+ if (offset == 1) {
+ return true;
+ }
+ return false;
+}
+
+#endif /* BLOCK_QED_H */
diff --git a/contrib/qemu/block/snapshot.c b/contrib/qemu/block/snapshot.c
new file mode 100644
index 000000000..6c6d9deea
--- /dev/null
+++ b/contrib/qemu/block/snapshot.c
@@ -0,0 +1,157 @@
+/*
+ * Block layer snapshot related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/snapshot.h"
+#include "block/block_int.h"
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+ const char *name)
+{
+ QEMUSnapshotInfo *sn_tab, *sn;
+ int nb_sns, i, ret;
+
+ ret = -ENOENT;
+ nb_sns = bdrv_snapshot_list(bs, &sn_tab);
+ if (nb_sns < 0) {
+ return ret;
+ }
+ for (i = 0; i < nb_sns; i++) {
+ sn = &sn_tab[i];
+ if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) {
+ *sn_info = *sn;
+ ret = 0;
+ break;
+ }
+ }
+ g_free(sn_tab);
+ return ret;
+}
+
+int bdrv_can_snapshot(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+ return 0;
+ }
+
+ if (!drv->bdrv_snapshot_create) {
+ if (bs->file != NULL) {
+ return bdrv_can_snapshot(bs->file);
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+int bdrv_snapshot_create(BlockDriverState *bs,
+ QEMUSnapshotInfo *sn_info)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_create) {
+ return drv->bdrv_snapshot_create(bs, sn_info);
+ }
+ if (bs->file) {
+ return bdrv_snapshot_create(bs->file, sn_info);
+ }
+ return -ENOTSUP;
+}
+
+int bdrv_snapshot_goto(BlockDriverState *bs,
+ const char *snapshot_id)
+{
+ BlockDriver *drv = bs->drv;
+ int ret, open_ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_goto) {
+ return drv->bdrv_snapshot_goto(bs, snapshot_id);
+ }
+
+ if (bs->file) {
+ drv->bdrv_close(bs);
+ ret = bdrv_snapshot_goto(bs->file, snapshot_id);
+ open_ret = drv->bdrv_open(bs, NULL, bs->open_flags);
+ if (open_ret < 0) {
+ bdrv_delete(bs->file);
+ bs->drv = NULL;
+ return open_ret;
+ }
+ return ret;
+ }
+
+ return -ENOTSUP;
+}
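+
+/* Editor's sketch (not part of the original QEMU code): typical usage of the
+ * two helpers above, reverting to a named snapshot only when it exists. The
+ * function name is illustrative.
+ */
+static inline int example_revert_to_snapshot(BlockDriverState *bs,
+ const char *name)
+{
+ QEMUSnapshotInfo sn;
+ int ret;
+
+ ret = bdrv_snapshot_find(bs, &sn, name);
+ if (ret < 0) {
+ return ret; /* -ENOENT if no snapshot matches the id or name */
+ }
+ return bdrv_snapshot_goto(bs, sn.id_str);
+}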
+
+int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_delete) {
+ return drv->bdrv_snapshot_delete(bs, snapshot_id);
+ }
+ if (bs->file) {
+ return bdrv_snapshot_delete(bs->file, snapshot_id);
+ }
+ return -ENOTSUP;
+}
+
+int bdrv_snapshot_list(BlockDriverState *bs,
+ QEMUSnapshotInfo **psn_info)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (drv->bdrv_snapshot_list) {
+ return drv->bdrv_snapshot_list(bs, psn_info);
+ }
+ if (bs->file) {
+ return bdrv_snapshot_list(bs->file, psn_info);
+ }
+ return -ENOTSUP;
+}
+
+int bdrv_snapshot_load_tmp(BlockDriverState *bs,
+ const char *snapshot_name)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (!bs->read_only) {
+ return -EINVAL;
+ }
+ if (drv->bdrv_snapshot_load_tmp) {
+ return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
+ }
+ return -ENOTSUP;
+}
diff --git a/contrib/qemu/config-host.h b/contrib/qemu/config-host.h
new file mode 100644
index 000000000..46b1595a8
--- /dev/null
+++ b/contrib/qemu/config-host.h
@@ -0,0 +1,73 @@
+/* Automatically generated by create_config - do not modify */
+#define CONFIG_QEMU_CONFDIR "/usr/local/etc/qemu"
+#define CONFIG_QEMU_DATADIR "/usr/local/share/qemu"
+#define CONFIG_QEMU_DOCDIR "/usr/local/share/doc/qemu"
+#define CONFIG_QEMU_LOCALSTATEDIR "/usr/local/var"
+#define CONFIG_QEMU_HELPERDIR "/usr/local/libexec"
+#define CONFIG_QEMU_LOCALEDIR "/usr/local/share/locale"
+#define HOST_X86_64 1
+#define CONFIG_QEMU_LDST_OPTIMIZATION 1
+#define CONFIG_POSIX 1
+#define CONFIG_LINUX 1
+#define CONFIG_SLIRP 1
+#define CONFIG_SMBD_COMMAND "/usr/sbin/smbd"
+#define CONFIG_AUDIO_DRIVERS \
+ &oss_audio_driver,\
+
+#define CONFIG_OSS 1
+#define CONFIG_BDRV_RW_WHITELIST\
+ NULL
+#define CONFIG_BDRV_RO_WHITELIST\
+ NULL
+#define CONFIG_VNC 1
+#define CONFIG_VNC_TLS 1
+#define CONFIG_VNC_SASL 1
+#define CONFIG_VNC_WS 1
+#define CONFIG_FNMATCH 1
+#define CONFIG_UUID 1
+#define CONFIG_XFS 1
+#define QEMU_VERSION "1.5.50"
+#define QEMU_PKGVERSION ""
+#define CONFIG_CURSES 1
+#define CONFIG_UTIMENSAT 1
+#define CONFIG_PIPE2 1
+#define CONFIG_ACCEPT4 1
+#define CONFIG_SPLICE 1
+#define CONFIG_EVENTFD 1
+#define CONFIG_FALLOCATE 1
+#define CONFIG_FALLOCATE_PUNCH_HOLE 1
+#define CONFIG_SYNC_FILE_RANGE 1
+#define CONFIG_FIEMAP 1
+#define CONFIG_DUP3 1
+#define CONFIG_EPOLL 1
+#define CONFIG_EPOLL_CREATE1 1
+#define CONFIG_EPOLL_PWAIT 1
+#define CONFIG_SENDFILE 1
+#define CONFIG_INOTIFY 1
+#define CONFIG_INOTIFY1 1
+#define CONFIG_BYTESWAP_H 1
+#define CONFIG_CURL 1
+#define CONFIG_LINUX_AIO 1
+#define CONFIG_ATTR 1
+#define CONFIG_VHOST_SCSI 1
+#define CONFIG_IOVEC 1
+#define CONFIG_PREADV 1
+#define CONFIG_FDT 1
+#define CONFIG_SIGNALFD 1
+#define CONFIG_FDATASYNC 1
+#define CONFIG_MADVISE 1
+#define CONFIG_POSIX_MADVISE 1
+#define CONFIG_SIGEV_THREAD_ID 1
+#define CONFIG_UNAME_RELEASE ""
+#define CONFIG_QOM_CAST_DEBUG 1
+#define CONFIG_COROUTINE_BACKEND ucontext
+#define CONFIG_OPEN_BY_HANDLE 1
+#define CONFIG_LINUX_MAGIC_H 1
+#define CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE 1
+#define CONFIG_HAS_ENVIRON 1
+#define CONFIG_CPUID_H 1
+#define CONFIG_INT128 1
+#define CONFIG_VIRTIO_BLK_DATA_PLANE $(CONFIG_VIRTIO)
+#define CONFIG_TRACE_NOP 1
+#define CONFIG_TRACE_FILE trace
+#define CONFIG_TRACE_DEFAULT 1
diff --git a/contrib/qemu/coroutine-ucontext.c b/contrib/qemu/coroutine-ucontext.c
new file mode 100644
index 000000000..4bf2cde27
--- /dev/null
+++ b/contrib/qemu/coroutine-ucontext.c
@@ -0,0 +1,225 @@
+/*
+ * ucontext coroutine initialization code
+ *
+ * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include <stdlib.h>
+#include <setjmp.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <ucontext.h>
+#include "qemu-common.h"
+#include "block/coroutine_int.h"
+
+#ifdef CONFIG_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
+typedef struct {
+ Coroutine base;
+ void *stack;
+ sigjmp_buf env;
+
+#ifdef CONFIG_VALGRIND_H
+ unsigned int valgrind_stack_id;
+#endif
+
+} CoroutineUContext;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+typedef struct {
+ /** Currently executing coroutine */
+ Coroutine *current;
+
+ /** The default coroutine */
+ CoroutineUContext leader;
+} CoroutineThreadState;
+
+static pthread_key_t thread_state_key;
+
+/*
+ * va_args to makecontext() must be type 'int', so passing
+ * the pointer we need may require several int args. This
+ * union is a quick hack to let us do that
+ */
+union cc_arg {
+ void *p;
+ int i[2];
+};
+
+static CoroutineThreadState *coroutine_get_thread_state(void)
+{
+ CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+ if (!s) {
+ s = g_malloc0(sizeof(*s));
+ s->current = &s->leader.base;
+ pthread_setspecific(thread_state_key, s);
+ }
+ return s;
+}
+
+static void qemu_coroutine_thread_cleanup(void *opaque)
+{
+ CoroutineThreadState *s = opaque;
+
+ g_free(s);
+}
+
+static void __attribute__((constructor)) coroutine_init(void)
+{
+ int ret;
+
+ ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup);
+ if (ret != 0) {
+ fprintf(stderr, "unable to create leader key: %s\n", strerror(ret));
+ abort();
+ }
+}
+
+static void coroutine_trampoline(int i0, int i1)
+{
+ union cc_arg arg;
+ CoroutineUContext *self;
+ Coroutine *co;
+
+ arg.i[0] = i0;
+ arg.i[1] = i1;
+ self = arg.p;
+ co = &self->base;
+
+ /* Initialize longjmp environment and switch back to the caller */
+ if (!sigsetjmp(self->env, 0)) {
+ siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
+ }
+
+ while (true) {
+ co->entry(co->entry_arg);
+ qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+ }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+ const size_t stack_size = 1 << 20;
+ CoroutineUContext *co;
+ ucontext_t old_uc, uc;
+ sigjmp_buf old_env;
+ union cc_arg arg = {0};
+
+ /* The ucontext functions preserve signal masks which incurs a
+ * system call overhead. sigsetjmp(buf, 0)/siglongjmp() does not
+ * preserve signal masks but only works on the current stack.
+ * Since we need a way to create and switch to a new stack, use
+ * the ucontext functions for that but sigsetjmp()/siglongjmp() for
+ * everything else.
+ */
+
+ if (getcontext(&uc) == -1) {
+ abort();
+ }
+
+ co = g_malloc0(sizeof(*co));
+ co->stack = g_malloc(stack_size);
+ co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+
+ uc.uc_link = &old_uc;
+ uc.uc_stack.ss_sp = co->stack;
+ uc.uc_stack.ss_size = stack_size;
+ uc.uc_stack.ss_flags = 0;
+
+#ifdef CONFIG_VALGRIND_H
+ co->valgrind_stack_id =
+ VALGRIND_STACK_REGISTER(co->stack, co->stack + stack_size);
+#endif
+
+ arg.p = co;
+
+ makecontext(&uc, (void (*)(void))coroutine_trampoline,
+ 2, arg.i[0], arg.i[1]);
+
+ /* swapcontext() in, siglongjmp() back out */
+ if (!sigsetjmp(old_env, 0)) {
+ swapcontext(&old_uc, &uc);
+ }
+ return &co->base;
+}
+
+#ifdef CONFIG_VALGRIND_H
+#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE
+/* Work around an unused variable in the valgrind.h macro... */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+static inline void valgrind_stack_deregister(CoroutineUContext *co)
+{
+ VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id);
+}
+#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+ CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_);
+
+#ifdef CONFIG_VALGRIND_H
+ valgrind_stack_deregister(co);
+#endif
+
+ g_free(co->stack);
+ g_free(co);
+}
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ CoroutineAction action)
+{
+ CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
+ CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
+ CoroutineThreadState *s = coroutine_get_thread_state();
+ int ret;
+
+ s->current = to_;
+
+ ret = sigsetjmp(from->env, 0);
+ if (ret == 0) {
+ siglongjmp(to->env, action);
+ }
+ return ret;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+ CoroutineThreadState *s = coroutine_get_thread_state();
+
+ return s->current;
+}
+
+bool qemu_in_coroutine(void)
+{
+ CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+ return s && s->current->caller;
+}
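
The backend above uses the ucontext calls only to set up each coroutine's stack; every later switch goes through sigsetjmp()/siglongjmp(), because swapcontext() saves and restores the signal mask with a system call on every switch. The toy program below reproduces that handshake outside QEMU; it is a hedged illustration, not the QEMU API, and (as the file itself notes) it must be built without _FORTIFY_SOURCE so glibc does not abort on the cross-stack longjmp.

/* Minimal ucontext + sigsetjmp coroutine handshake (illustration only). */
#include <stdio.h>
#include <setjmp.h>
#include <ucontext.h>

static sigjmp_buf co_env, caller_env;
static ucontext_t uc, old_uc;
static char stack[64 * 1024];

static void trampoline(void)
{
    /* First entry: remember where the coroutine can be resumed, then hop
     * straight back to the creator (mirrors coroutine_trampoline above). */
    if (!sigsetjmp(co_env, 0)) {
        siglongjmp(caller_env, 1);
    }
    /* Re-entered later through siglongjmp(co_env, ...). */
    printf("hello from the coroutine stack\n");
    siglongjmp(caller_env, 2);
}

int main(void)
{
    getcontext(&uc);
    uc.uc_link = &old_uc;
    uc.uc_stack.ss_sp = stack;
    uc.uc_stack.ss_size = sizeof(stack);
    makecontext(&uc, trampoline, 0);

    /* Creation: swapcontext() in, siglongjmp() back out (qemu_coroutine_new). */
    if (!sigsetjmp(caller_env, 0)) {
        swapcontext(&old_uc, &uc);
    }

    /* Switch: save our position, jump into the coroutine
     * (qemu_coroutine_switch). */
    if (sigsetjmp(caller_env, 0) == 0) {
        siglongjmp(co_env, 1);
    }
    printf("back on the original stack\n");
    return 0;
}
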
diff --git a/contrib/qemu/include/block/aio.h b/contrib/qemu/include/block/aio.h
new file mode 100644
index 000000000..183679374
--- /dev/null
+++ b/contrib/qemu/include/block/aio.h
@@ -0,0 +1,247 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_AIO_H
+#define QEMU_AIO_H
+
+#include "qemu-common.h"
+#include "qemu/queue.h"
+#include "qemu/event_notifier.h"
+
+typedef struct BlockDriverAIOCB BlockDriverAIOCB;
+typedef void BlockDriverCompletionFunc(void *opaque, int ret);
+
+typedef struct AIOCBInfo {
+ void (*cancel)(BlockDriverAIOCB *acb);
+ size_t aiocb_size;
+} AIOCBInfo;
+
+struct BlockDriverAIOCB {
+ const AIOCBInfo *aiocb_info;
+ BlockDriverState *bs;
+ BlockDriverCompletionFunc *cb;
+ void *opaque;
+};
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
+void qemu_aio_release(void *p);
+
+typedef struct AioHandler AioHandler;
+typedef void QEMUBHFunc(void *opaque);
+typedef void IOHandler(void *opaque);
+
+typedef struct AioContext {
+ GSource source;
+
+ /* The list of registered AIO handlers */
+ QLIST_HEAD(, AioHandler) aio_handlers;
+
+ /* This is a simple lock used to protect the aio_handlers list.
+ * Specifically, it's used to ensure that no callbacks are removed while
+ * we're walking and dispatching callbacks.
+ */
+ int walking_handlers;
+
+ /* Anchor of the list of Bottom Halves belonging to the context */
+ struct QEMUBH *first_bh;
+
+ /* A simple lock used to protect the first_bh list, and ensure that
+ * no callbacks are removed while we're walking and dispatching callbacks.
+ */
+ int walking_bh;
+
+ /* Used for aio_notify. */
+ EventNotifier notifier;
+
+ /* GPollFDs for aio_poll() */
+ GArray *pollfds;
+
+ /* Thread pool for performing work and receiving completion callbacks */
+ struct ThreadPool *thread_pool;
+} AioContext;
+
+/* Returns 1 if there are still outstanding AIO requests; 0 otherwise */
+typedef int (AioFlushEventNotifierHandler)(EventNotifier *e);
+
+/**
+ * aio_context_new: Allocate a new AioContext.
+ *
+ * An AioContext provides a mini event-loop that can be waited on synchronously.
+ * It also provides bottom halves, a service to execute a piece of code
+ * as soon as possible.
+ */
+AioContext *aio_context_new(void);
+
+/**
+ * aio_context_ref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Add a reference to an AioContext.
+ */
+void aio_context_ref(AioContext *ctx);
+
+/**
+ * aio_context_unref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Drop a reference to an AioContext.
+ */
+void aio_context_unref(AioContext *ctx);
+
+/**
+ * aio_bh_new: Allocate a new bottom half structure.
+ *
+ * Bottom halves are lightweight callbacks whose invocation is guaranteed
+ * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure
+ * is opaque and must be allocated prior to its use.
+ */
+QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque);
+
+/**
+ * aio_notify: Force processing of pending events.
+ *
+ * Similar to signaling a condition variable, aio_notify forces
+ * aio_wait to exit, so that the next call will re-examine pending events.
+ * The caller of aio_notify will usually call aio_wait again very soon,
+ * or go through another iteration of the GLib main loop. Hence, aio_notify
+ * also has the side effect of recalculating the sets of file descriptors
+ * that the main loop waits for.
+ *
+ * Calling aio_notify is rarely necessary, because for example scheduling
+ * a bottom half calls it already.
+ */
+void aio_notify(AioContext *ctx);
+
+/**
+ * aio_bh_poll: Poll bottom halves for an AioContext.
+ *
+ * These are internal functions used by the QEMU main loop.
+ */
+int aio_bh_poll(AioContext *ctx);
+
+/**
+ * qemu_bh_schedule: Schedule a bottom half.
+ *
+ * Scheduling a bottom half interrupts the main loop and causes the
+ * execution of the callback that was passed to qemu_bh_new.
+ *
+ * Bottom halves that are scheduled from a bottom half handler are instantly
+ * invoked. This can create an infinite loop if a bottom half handler
+ * schedules itself.
+ *
+ * @bh: The bottom half to be scheduled.
+ */
+void qemu_bh_schedule(QEMUBH *bh);
+
+/**
+ * qemu_bh_cancel: Cancel execution of a bottom half.
+ *
+ * Canceling execution of a bottom half undoes the effect of calls to
+ * qemu_bh_schedule without freeing its resources yet. While cancellation
+ * itself is also wait-free and thread-safe, it can of course race with the
+ * loop that executes bottom halves unless you are holding the iothread
+ * mutex. This makes it mostly useless if you are not holding the mutex.
+ *
+ * @bh: The bottom half to be canceled.
+ */
+void qemu_bh_cancel(QEMUBH *bh);
+
+/**
+ * qemu_bh_delete: Cancel execution of a bottom half and free its resources.
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new. It also implies canceling the bottom half if it was
+ * scheduled.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_pending(AioContext *ctx);
+
+/* Make progress in completing AIO work. This can issue new pending
+ * aio as a result of executing I/O completion or bh callbacks.
+ *
+ * If there is no pending AIO operation or completion (bottom half),
+ * return false. If there are pending AIO operations or bottom halves,
+ * return true.
+ *
+ * If there are no pending bottom halves, but there are pending AIO
+ * operations, it may not be possible to make any progress without
+ * blocking. If @blocking is true, this function will wait until one
+ * or more AIO events have completed, to ensure something has moved
+ * before returning.
+ */
+bool aio_poll(AioContext *ctx, bool blocking);
+
+#ifdef CONFIG_POSIX
+/* Returns 1 if there are still outstanding AIO requests; 0 otherwise */
+typedef int (AioFlushHandler)(void *opaque);
+
+/* Register a file descriptor and associated callbacks. Behaves very similarly
+ * to qemu_set_fd_handler2. Unlike qemu_set_fd_handler2, these callbacks will
+ * be invoked when using qemu_aio_wait().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+void aio_set_fd_handler(AioContext *ctx,
+ int fd,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioFlushHandler *io_flush,
+ void *opaque);
+#endif
+
+/* Register an event notifier and associated callbacks. Behaves very similarly
+ * to event_notifier_set_handler. Unlike event_notifier_set_handler, these callbacks
+ * will be invoked when using qemu_aio_wait().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of event_notifier_set_handler.
+ */
+void aio_set_event_notifier(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_read,
+ AioFlushEventNotifierHandler *io_flush);
+
+/* Return a GSource that lets the main loop poll the file descriptors attached
+ * to this AioContext.
+ */
+GSource *aio_get_g_source(AioContext *ctx);
+
+/* Return the ThreadPool bound to this AioContext */
+struct ThreadPool *aio_get_thread_pool(AioContext *ctx);
+
+/* Functions to operate on the main QEMU AioContext. */
+
+bool qemu_aio_wait(void);
+void qemu_aio_set_event_notifier(EventNotifier *notifier,
+ EventNotifierHandler *io_read,
+ AioFlushEventNotifierHandler *io_flush);
+
+#ifdef CONFIG_POSIX
+void qemu_aio_set_fd_handler(int fd,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioFlushHandler *io_flush,
+ void *opaque);
+#endif
+
+#endif
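
For orientation, a hypothetical caller of this API creates a context, attaches a bottom half to it and then spins in aio_poll() until the callback has run. The sketch below uses only functions declared in the header above and assumes the surrounding QEMU build environment; it is not a standalone program.

/* Hypothetical usage sketch for the AioContext / bottom-half API above. */
#include "block/aio.h"

static void done_bh(void *opaque)
{
    bool *done = opaque;
    *done = true;               /* runs from within aio_poll() dispatch */
}

static void wait_for_bh(void)
{
    AioContext *ctx = aio_context_new();
    bool done = false;
    QEMUBH *bh = aio_bh_new(ctx, done_bh, &done);

    qemu_bh_schedule(bh);       /* scheduling also notifies the context */
    while (!done) {
        aio_poll(ctx, true);    /* blocking: dispatch bottom halves and AIO */
    }

    qemu_bh_delete(bh);
    aio_context_unref(ctx);
}
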
diff --git a/contrib/qemu/include/block/block.h b/contrib/qemu/include/block/block.h
new file mode 100644
index 000000000..b6b9014a9
--- /dev/null
+++ b/contrib/qemu/include/block/block.h
@@ -0,0 +1,443 @@
+#ifndef BLOCK_H
+#define BLOCK_H
+
+#include "block/aio.h"
+#include "qemu-common.h"
+#include "qemu/option.h"
+#include "block/coroutine.h"
+#include "qapi/qmp/qobject.h"
+#include "qapi-types.h"
+
+/* block.c */
+typedef struct BlockDriver BlockDriver;
+typedef struct BlockJob BlockJob;
+
+typedef struct BlockDriverInfo {
+ /* in bytes, 0 if irrelevant */
+ int cluster_size;
+ /* offset at which the VM state can be saved (0 if not possible) */
+ int64_t vm_state_offset;
+ bool is_dirty;
+} BlockDriverInfo;
+
+typedef struct BlockFragInfo {
+ uint64_t allocated_clusters;
+ uint64_t total_clusters;
+ uint64_t fragmented_clusters;
+ uint64_t compressed_clusters;
+} BlockFragInfo;
+
+/* Callbacks for block device models */
+typedef struct BlockDevOps {
+ /*
+ * Runs when virtual media changed (monitor commands eject, change)
+ * Argument load is true on load and false on eject.
+ * Beware: doesn't run when a host device's physical media
+ * changes. Sure would be useful if it did.
+ * Device models with removable media must implement this callback.
+ */
+ void (*change_media_cb)(void *opaque, bool load);
+ /*
+ * Runs when an eject request is issued from the monitor, the tray
+ * is closed, and the medium is locked.
+ * Device models that do not implement is_medium_locked will not need
+ * this callback. Device models that can lock the medium or tray might
+ * want to implement the callback and unlock the tray when "force" is
+ * true, even if they do not support eject requests.
+ */
+ void (*eject_request_cb)(void *opaque, bool force);
+ /*
+ * Is the virtual tray open?
+ * Device models implement this only when the device has a tray.
+ */
+ bool (*is_tray_open)(void *opaque);
+ /*
+ * Is the virtual medium locked into the device?
+ * Device models implement this only when device has such a lock.
+ */
+ bool (*is_medium_locked)(void *opaque);
+ /*
+ * Runs when the size changed (e.g. monitor command block_resize)
+ */
+ void (*resize_cb)(void *opaque);
+} BlockDevOps;
+
+#define BDRV_O_RDWR 0x0002
+#define BDRV_O_SNAPSHOT 0x0008 /* open the file read only and save writes in a snapshot */
+#define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */
+#define BDRV_O_CACHE_WB 0x0040 /* use write-back caching */
+#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */
+#define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */
+#define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */
+#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
+#define BDRV_O_INCOMING 0x0800 /* consistency hint for incoming migration */
+#define BDRV_O_CHECK 0x1000 /* open solely for consistency check */
+#define BDRV_O_ALLOW_RDWR 0x2000 /* allow reopen to change from r/o to r/w */
+#define BDRV_O_UNMAP 0x4000 /* execute guest UNMAP/TRIM operations */
+
+#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
+
+#define BDRV_SECTOR_BITS 9
+#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS)
+#define BDRV_SECTOR_MASK ~(BDRV_SECTOR_SIZE - 1)
+
+typedef enum {
+ BDRV_ACTION_REPORT, BDRV_ACTION_IGNORE, BDRV_ACTION_STOP
+} BlockErrorAction;
+
+typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
+
+typedef struct BDRVReopenState {
+ BlockDriverState *bs;
+ int flags;
+ void *opaque;
+} BDRVReopenState;
+
+
+void bdrv_iostatus_enable(BlockDriverState *bs);
+void bdrv_iostatus_reset(BlockDriverState *bs);
+void bdrv_iostatus_disable(BlockDriverState *bs);
+bool bdrv_iostatus_is_enabled(const BlockDriverState *bs);
+void bdrv_iostatus_set_err(BlockDriverState *bs, int error);
+void bdrv_info_print(Monitor *mon, const QObject *data);
+void bdrv_info(Monitor *mon, QObject **ret_data);
+void bdrv_stats_print(Monitor *mon, const QObject *data);
+void bdrv_info_stats(Monitor *mon, QObject **ret_data);
+
+/* disk I/O throttling */
+void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
+bool bdrv_io_limits_enabled(BlockDriverState *bs);
+
+void bdrv_init(void);
+void bdrv_init_with_whitelist(void);
+BlockDriver *bdrv_find_protocol(const char *filename,
+ bool allow_protocol_prefix);
+BlockDriver *bdrv_find_format(const char *format_name);
+BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
+ bool readonly);
+int bdrv_create(BlockDriver *drv, const char* filename,
+ QEMUOptionParameter *options);
+int bdrv_create_file(const char* filename, QEMUOptionParameter *options);
+BlockDriverState *bdrv_new(const char *device_name);
+void bdrv_make_anon(BlockDriverState *bs);
+void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old);
+void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top);
+void bdrv_delete(BlockDriverState *bs);
+int bdrv_parse_cache_flags(const char *mode, int *flags);
+int bdrv_parse_discard_flags(const char *mode, int *flags);
+int bdrv_file_open(BlockDriverState **pbs, const char *filename,
+ QDict *options, int flags);
+int bdrv_open_backing_file(BlockDriverState *bs, QDict *options);
+int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
+ int flags, BlockDriver *drv);
+BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
+ BlockDriverState *bs, int flags);
+int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
+int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp);
+int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
+ BlockReopenQueue *queue, Error **errp);
+void bdrv_reopen_commit(BDRVReopenState *reopen_state);
+void bdrv_reopen_abort(BDRVReopenState *reopen_state);
+void bdrv_close(BlockDriverState *bs);
+void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify);
+int bdrv_attach_dev(BlockDriverState *bs, void *dev);
+void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev);
+void bdrv_detach_dev(BlockDriverState *bs, void *dev);
+void *bdrv_get_attached_dev(BlockDriverState *bs);
+void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
+ void *opaque);
+void bdrv_dev_eject_request(BlockDriverState *bs, bool force);
+bool bdrv_dev_has_removable_media(BlockDriverState *bs);
+bool bdrv_dev_is_tray_open(BlockDriverState *bs);
+bool bdrv_dev_is_medium_locked(BlockDriverState *bs);
+int bdrv_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors);
+int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors);
+int bdrv_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors);
+int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov);
+int bdrv_pread(BlockDriverState *bs, int64_t offset,
+ void *buf, int count);
+int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count);
+int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov);
+int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count);
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov);
+/*
+ * Efficiently zero a region of the disk image. Note that this is a regular
+ * I/O request like read or write and should have a reasonable size. This
+ * function is not suitable for zeroing the entire image in a single request
+ * because it may allocate memory for the entire region.
+ */
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors);
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, int *pnum);
+int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum);
+BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
+ const char *backing_file);
+int bdrv_get_backing_file_depth(BlockDriverState *bs);
+int bdrv_truncate(BlockDriverState *bs, int64_t offset);
+int64_t bdrv_getlength(BlockDriverState *bs);
+int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
+void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
+int bdrv_commit(BlockDriverState *bs);
+int bdrv_commit_all(void);
+int bdrv_change_backing_file(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt);
+void bdrv_register(BlockDriver *bdrv);
+int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
+ BlockDriverState *base);
+BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
+ BlockDriverState *bs);
+BlockDriverState *bdrv_find_base(BlockDriverState *bs);
+
+
+typedef struct BdrvCheckResult {
+ int corruptions;
+ int leaks;
+ int check_errors;
+ int corruptions_fixed;
+ int leaks_fixed;
+ int64_t image_end_offset;
+ BlockFragInfo bfi;
+} BdrvCheckResult;
+
+typedef enum {
+ BDRV_FIX_LEAKS = 1,
+ BDRV_FIX_ERRORS = 2,
+} BdrvCheckMode;
+
+int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix);
+
+/* async block I/O */
+typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
+ int sector_num);
+BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *iov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *iov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+void bdrv_aio_cancel(BlockDriverAIOCB *acb);
+
+typedef struct BlockRequest {
+ /* Fields to be filled by multiwrite caller */
+ int64_t sector;
+ int nb_sectors;
+ QEMUIOVector *qiov;
+ BlockDriverCompletionFunc *cb;
+ void *opaque;
+
+ /* Filled by multiwrite implementation */
+ int error;
+} BlockRequest;
+
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs,
+ int num_reqs);
+
+/* sg packet commands */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf);
+BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque);
+
+/* Invalidate any cached metadata used by image formats */
+void bdrv_invalidate_cache(BlockDriverState *bs);
+void bdrv_invalidate_cache_all(void);
+
+void bdrv_clear_incoming_migration_all(void);
+
+/* Ensure contents are flushed to disk. */
+int bdrv_flush(BlockDriverState *bs);
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
+int bdrv_flush_all(void);
+void bdrv_close_all(void);
+void bdrv_drain_all(void);
+
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
+int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
+int bdrv_has_zero_init_1(BlockDriverState *bs);
+int bdrv_has_zero_init(BlockDriverState *bs);
+int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ int *pnum);
+int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
+ int64_t sector_num, int nb_sectors, int *pnum);
+
+void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
+ BlockdevOnError on_write_error);
+BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read);
+BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error);
+void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
+ bool is_read, int error);
+int bdrv_is_read_only(BlockDriverState *bs);
+int bdrv_is_sg(BlockDriverState *bs);
+int bdrv_enable_write_cache(BlockDriverState *bs);
+void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce);
+int bdrv_is_inserted(BlockDriverState *bs);
+int bdrv_media_changed(BlockDriverState *bs);
+void bdrv_lock_medium(BlockDriverState *bs, bool locked);
+void bdrv_eject(BlockDriverState *bs, bool eject_flag);
+const char *bdrv_get_format_name(BlockDriverState *bs);
+BlockDriverState *bdrv_find(const char *name);
+BlockDriverState *bdrv_next(BlockDriverState *bs);
+void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs),
+ void *opaque);
+int bdrv_is_encrypted(BlockDriverState *bs);
+int bdrv_key_required(BlockDriverState *bs);
+int bdrv_set_key(BlockDriverState *bs, const char *key);
+int bdrv_query_missing_keys(void);
+void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
+ void *opaque);
+const char *bdrv_get_device_name(BlockDriverState *bs);
+int bdrv_get_flags(BlockDriverState *bs);
+int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors);
+int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+void bdrv_round_to_clusters(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ int64_t *cluster_sector_num,
+ int *cluster_nb_sectors);
+
+const char *bdrv_get_encrypted_filename(BlockDriverState *bs);
+void bdrv_get_backing_filename(BlockDriverState *bs,
+ char *filename, int filename_size);
+void bdrv_get_full_backing_filename(BlockDriverState *bs,
+ char *dest, size_t sz);
+int bdrv_is_snapshot(BlockDriverState *bs);
+
+int path_is_absolute(const char *path);
+void path_combine(char *dest, int dest_size,
+ const char *base_path,
+ const char *filename);
+
+int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+ int64_t pos, int size);
+
+int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size);
+
+void bdrv_img_create(const char *filename, const char *fmt,
+ const char *base_filename, const char *base_fmt,
+ char *options, uint64_t img_size, int flags,
+ Error **errp, bool quiet);
+
+void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
+void *qemu_blockalign(BlockDriverState *bs, size_t size);
+bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
+
+struct HBitmapIter;
+void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity);
+int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
+void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
+void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
+void bdrv_dirty_iter_init(BlockDriverState *bs, struct HBitmapIter *hbi);
+int64_t bdrv_get_dirty_count(BlockDriverState *bs);
+
+void bdrv_enable_copy_on_read(BlockDriverState *bs);
+void bdrv_disable_copy_on_read(BlockDriverState *bs);
+
+void bdrv_set_in_use(BlockDriverState *bs, int in_use);
+int bdrv_in_use(BlockDriverState *bs);
+
+#ifdef CONFIG_LINUX_AIO
+int raw_get_aio_fd(BlockDriverState *bs);
+#else
+static inline int raw_get_aio_fd(BlockDriverState *bs)
+{
+ return -ENOTSUP;
+}
+#endif
+
+enum BlockAcctType {
+ BDRV_ACCT_READ,
+ BDRV_ACCT_WRITE,
+ BDRV_ACCT_FLUSH,
+ BDRV_MAX_IOTYPE,
+};
+
+typedef struct BlockAcctCookie {
+ int64_t bytes;
+ int64_t start_time_ns;
+ enum BlockAcctType type;
+} BlockAcctCookie;
+
+void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
+ int64_t bytes, enum BlockAcctType type);
+void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie);
+
+typedef enum {
+ BLKDBG_L1_UPDATE,
+
+ BLKDBG_L1_GROW_ALLOC_TABLE,
+ BLKDBG_L1_GROW_WRITE_TABLE,
+ BLKDBG_L1_GROW_ACTIVATE_TABLE,
+
+ BLKDBG_L2_LOAD,
+ BLKDBG_L2_UPDATE,
+ BLKDBG_L2_UPDATE_COMPRESSED,
+ BLKDBG_L2_ALLOC_COW_READ,
+ BLKDBG_L2_ALLOC_WRITE,
+
+ BLKDBG_READ_AIO,
+ BLKDBG_READ_BACKING_AIO,
+ BLKDBG_READ_COMPRESSED,
+
+ BLKDBG_WRITE_AIO,
+ BLKDBG_WRITE_COMPRESSED,
+
+ BLKDBG_VMSTATE_LOAD,
+ BLKDBG_VMSTATE_SAVE,
+
+ BLKDBG_COW_READ,
+ BLKDBG_COW_WRITE,
+
+ BLKDBG_REFTABLE_LOAD,
+ BLKDBG_REFTABLE_GROW,
+
+ BLKDBG_REFBLOCK_LOAD,
+ BLKDBG_REFBLOCK_UPDATE,
+ BLKDBG_REFBLOCK_UPDATE_PART,
+ BLKDBG_REFBLOCK_ALLOC,
+ BLKDBG_REFBLOCK_ALLOC_HOOKUP,
+ BLKDBG_REFBLOCK_ALLOC_WRITE,
+ BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS,
+ BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE,
+ BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE,
+
+ BLKDBG_CLUSTER_ALLOC,
+ BLKDBG_CLUSTER_ALLOC_BYTES,
+ BLKDBG_CLUSTER_FREE,
+
+ BLKDBG_FLUSH_TO_OS,
+ BLKDBG_FLUSH_TO_DISK,
+
+ BLKDBG_EVENT_MAX,
+} BlkDebugEvent;
+
+#define BLKDBG_EVENT(bs, evt) bdrv_debug_event(bs, evt)
+void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event);
+
+int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
+ const char *tag);
+int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
+bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);
+
+#endif
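
As a quick reference for the declarations above, a hedged sketch of the usual init/open/read/close sequence follows; the device name, file name and minimal error handling are arbitrary choices, and the code assumes the QEMU build environment rather than standing alone.

/* Hypothetical usage sketch for the public block layer API above. */
#include "block/block.h"

static int dump_first_sector(const char *filename)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    BlockDriverState *bs;
    int ret;

    bdrv_init();                                   /* register built-in drivers */
    bs = bdrv_new("probe");
    ret = bdrv_open(bs, filename, NULL, 0, NULL);  /* probe the format, read-only */
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }

    ret = bdrv_read(bs, 0, buf, 1);                /* sector 0, one sector */
    bdrv_close(bs);
    bdrv_delete(bs);
    return ret;
}
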
diff --git a/contrib/qemu/include/block/block_int.h b/contrib/qemu/include/block/block_int.h
new file mode 100644
index 000000000..c6ac871e2
--- /dev/null
+++ b/contrib/qemu/include/block/block_int.h
@@ -0,0 +1,421 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCK_INT_H
+#define BLOCK_INT_H
+
+#include "block/block.h"
+#include "qemu/option.h"
+#include "qemu/queue.h"
+#include "block/coroutine.h"
+#include "qemu/timer.h"
+#include "qapi-types.h"
+#include "qapi/qmp/qerror.h"
+#include "monitor/monitor.h"
+#include "qemu/hbitmap.h"
+#include "block/snapshot.h"
+
+#define BLOCK_FLAG_ENCRYPT 1
+#define BLOCK_FLAG_COMPAT6 4
+#define BLOCK_FLAG_LAZY_REFCOUNTS 8
+
+#define BLOCK_IO_LIMIT_READ 0
+#define BLOCK_IO_LIMIT_WRITE 1
+#define BLOCK_IO_LIMIT_TOTAL 2
+
+#define BLOCK_IO_SLICE_TIME 100000000
+#define NANOSECONDS_PER_SECOND 1000000000.0
+
+#define BLOCK_OPT_SIZE "size"
+#define BLOCK_OPT_ENCRYPT "encryption"
+#define BLOCK_OPT_COMPAT6 "compat6"
+#define BLOCK_OPT_BACKING_FILE "backing_file"
+#define BLOCK_OPT_BACKING_FMT "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE "table_size"
+#define BLOCK_OPT_PREALLOC "preallocation"
+#define BLOCK_OPT_SUBFMT "subformat"
+#define BLOCK_OPT_COMPAT_LEVEL "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts"
+#define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
+
+typedef struct BdrvTrackedRequest {
+ BlockDriverState *bs;
+ int64_t sector_num;
+ int nb_sectors;
+ bool is_write;
+ QLIST_ENTRY(BdrvTrackedRequest) list;
+ Coroutine *co; /* owner, used for deadlock detection */
+ CoQueue wait_queue; /* coroutines blocked on this request */
+} BdrvTrackedRequest;
+
+
+typedef struct BlockIOLimit {
+ int64_t bps[3];
+ int64_t iops[3];
+} BlockIOLimit;
+
+typedef struct BlockIOBaseValue {
+ uint64_t bytes[2];
+ uint64_t ios[2];
+} BlockIOBaseValue;
+
+struct BlockDriver {
+ const char *format_name;
+ int instance_size;
+ int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
+ int (*bdrv_probe_device)(const char *filename);
+
+ /* Any driver implementing this callback is expected to be able to handle
+ * NULL file names in its .bdrv_open() implementation */
+ void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
+
+ /* For handling image reopen for split or non-split files */
+ int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
+ BlockReopenQueue *queue, Error **errp);
+ void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
+ void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
+
+ int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags);
+ int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags);
+ int (*bdrv_read)(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors);
+ int (*bdrv_write)(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors);
+ void (*bdrv_close)(BlockDriverState *bs);
+ void (*bdrv_rebind)(BlockDriverState *bs);
+ int (*bdrv_create)(const char *filename, QEMUOptionParameter *options);
+ int (*bdrv_set_key)(BlockDriverState *bs, const char *key);
+ int (*bdrv_make_empty)(BlockDriverState *bs);
+ /* aio */
+ BlockDriverAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverAIOCB *(*bdrv_aio_discard)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque);
+
+ int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+ int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+ /*
+ * Efficiently zero a region of the disk image. Typically an image format
+ * would use a compact metadata representation to implement this. This
+ * function pointer may be NULL and .bdrv_co_writev() will be called
+ * instead.
+ */
+ int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors);
+ int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors);
+ int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum);
+
+ /*
+ * Invalidate any cached meta-data.
+ */
+ void (*bdrv_invalidate_cache)(BlockDriverState *bs);
+
+ /*
+ * Flushes all data that was already written to the OS all the way down to
+ * the disk (for example raw-posix calls fsync()).
+ */
+ int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);
+
+ /*
+ * Flushes all internal caches to the OS. The data may still sit in a
+ * writeback cache of the host OS, but it will survive a crash of the qemu
+ * process.
+ */
+ int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);
+
+ const char *protocol_name;
+ int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset);
+ int64_t (*bdrv_getlength)(BlockDriverState *bs);
+ int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
+ int (*bdrv_write_compressed)(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors);
+
+ int (*bdrv_snapshot_create)(BlockDriverState *bs,
+ QEMUSnapshotInfo *sn_info);
+ int (*bdrv_snapshot_goto)(BlockDriverState *bs,
+ const char *snapshot_id);
+ int (*bdrv_snapshot_delete)(BlockDriverState *bs, const char *snapshot_id);
+ int (*bdrv_snapshot_list)(BlockDriverState *bs,
+ QEMUSnapshotInfo **psn_info);
+ int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
+ const char *snapshot_name);
+ int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
+
+ int (*bdrv_save_vmstate)(BlockDriverState *bs, QEMUIOVector *qiov,
+ int64_t pos);
+ int (*bdrv_load_vmstate)(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size);
+
+ int (*bdrv_change_backing_file)(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt);
+
+ /* removable device specific */
+ int (*bdrv_is_inserted)(BlockDriverState *bs);
+ int (*bdrv_media_changed)(BlockDriverState *bs);
+ void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
+ void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
+
+ /* to control generic scsi devices */
+ int (*bdrv_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf);
+ BlockDriverAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque);
+
+ /* List of options for creating images, terminated by name == NULL */
+ QEMUOptionParameter *create_options;
+
+
+ /*
+ * Returns 0 for completed check, -errno for internal errors.
+ * The check results are stored in result.
+ */
+ int (*bdrv_check)(BlockDriverState* bs, BdrvCheckResult *result,
+ BdrvCheckMode fix);
+
+ void (*bdrv_debug_event)(BlockDriverState *bs, BlkDebugEvent event);
+
+ /* TODO Better pass an option string/QDict/QemuOpts to add any rule? */
+ int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
+ const char *tag);
+ int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
+ bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
+
+ /*
+ * Returns 1 if newly created images are guaranteed to contain only
+ * zeros, 0 otherwise.
+ */
+ int (*bdrv_has_zero_init)(BlockDriverState *bs);
+
+ QLIST_ENTRY(BlockDriver) list;
+};
+
+/*
+ * Note: the function bdrv_append() copies and swaps contents of
+ * BlockDriverStates, so if you add new fields to this struct, please
+ * inspect bdrv_append() to determine if the new fields need to be
+ * copied as well.
+ */
+struct BlockDriverState {
+ int64_t total_sectors; /* if we are reading a disk image, give its
+ size in sectors */
+ int read_only; /* if true, the media is read only */
+ int open_flags; /* flags used to open the file, re-used for re-open */
+ int encrypted; /* if true, the media is encrypted */
+ int valid_key; /* if true, a valid encryption key has been set */
+ int sg; /* if true, the device is a /dev/sg* */
+ int copy_on_read; /* if true, copy read backing sectors into image
+ note this is a reference count */
+
+ BlockDriver *drv; /* NULL means no media */
+ void *opaque;
+
+ void *dev; /* attached device model, if any */
+ /* TODO change to DeviceState when all users are qdevified */
+ const BlockDevOps *dev_ops;
+ void *dev_opaque;
+
+ char filename[1024];
+ char backing_file[1024]; /* if non zero, the image is a diff of
+ this file image */
+ char backing_format[16]; /* if non-zero and backing_file exists */
+ int is_temporary;
+
+ BlockDriverState *backing_hd;
+ BlockDriverState *file;
+
+ NotifierList close_notifiers;
+
+ /* Callback before write request is processed */
+ NotifierWithReturnList before_write_notifiers;
+
+ /* number of in-flight copy-on-read requests */
+ unsigned int copy_on_read_in_flight;
+
+ /* the time for latest disk I/O */
+ int64_t slice_start;
+ int64_t slice_end;
+ BlockIOLimit io_limits;
+ BlockIOBaseValue slice_submitted;
+ CoQueue throttled_reqs;
+ QEMUTimer *block_timer;
+ bool io_limits_enabled;
+
+ /* I/O stats (display with "info blockstats"). */
+ uint64_t nr_bytes[BDRV_MAX_IOTYPE];
+ uint64_t nr_ops[BDRV_MAX_IOTYPE];
+ uint64_t total_time_ns[BDRV_MAX_IOTYPE];
+ uint64_t wr_highest_sector;
+
+ /* Whether the disk can expand beyond total_sectors */
+ int growable;
+
+ /* the memory alignment required for the buffers handled by this driver */
+ int buffer_alignment;
+
+ /* do we need to tell the guest if we have a volatile write cache? */
+ int enable_write_cache;
+
+ /* NOTE: the following infos are only hints for real hardware
+ drivers. They are not used by the block driver */
+ BlockdevOnError on_read_error, on_write_error;
+ bool iostatus_enabled;
+ BlockDeviceIoStatus iostatus;
+ char device_name[32];
+ HBitmap *dirty_bitmap;
+ int in_use; /* users other than guest access, eg. block migration */
+ QTAILQ_ENTRY(BlockDriverState) list;
+
+ QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+
+ /* long-running background operation */
+ BlockJob *job;
+
+ QDict *options;
+};
+
+int get_tmp_filename(char *filename, int size);
+
+void bdrv_set_io_limits(BlockDriverState *bs,
+ BlockIOLimit *io_limits);
+
+/**
+ * bdrv_add_before_write_notifier:
+ *
+ * Register a callback that is invoked before write requests are processed but
+ * after any throttling or waiting for overlapping requests.
+ */
+void bdrv_add_before_write_notifier(BlockDriverState *bs,
+ NotifierWithReturn *notifier);
+
+/**
+ * bdrv_get_aio_context:
+ *
+ * Returns: the currently bound #AioContext
+ */
+AioContext *bdrv_get_aio_context(BlockDriverState *bs);
+
+#ifdef _WIN32
+int is_windows_drive(const char *filename);
+#endif
+void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
+ enum MonitorEvent ev,
+ BlockErrorAction action, bool is_read);
+
+/**
+ * stream_start:
+ * @bs: Block device to operate on.
+ * @base: Block device that will become the new base, or %NULL to
+ * flatten the whole backing file chain onto @bs.
+ * @base_id: The file name that will be written to @bs as the new
+ * backing file if the job completes. Ignored if @base is %NULL.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Start a streaming operation on @bs. Clusters that are unallocated
+ * in @bs, but allocated in any image between @base and @bs (both
+ * exclusive) will be written to @bs. At the end of a successful
+ * streaming job, the backing file of @bs will be changed to
+ * @base_id in the written image and to @base in the live BlockDriverState.
+ */
+void stream_start(BlockDriverState *bs, BlockDriverState *base,
+ const char *base_id, int64_t speed, BlockdevOnError on_error,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp);
+
+/**
+ * commit_start:
+ * @bs: Top Block device
+ * @base: Block device that will be written into, and become the new top
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ */
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+ BlockDriverState *top, int64_t speed,
+ BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp);
+
+/*
+ * mirror_start:
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @granularity: The chosen granularity for the dirty bitmap.
+ * @buf_size: The amount of data that can be in flight at one time.
+ * @mode: Whether to collapse all images in the chain to the target.
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Start a mirroring operation on @bs. Clusters that are allocated
+ * in @bs will be written to @target until the job is cancelled or
+ * manually completed. At the end of a successful mirroring job,
+ * @bs will be switched to read from @target.
+ */
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+ int64_t speed, int64_t granularity, int64_t buf_size,
+ MirrorSyncMode mode, BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp);
+
+/*
+ * backup_start:
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ *
+ * Start a backup operation on @bs. Clusters in @bs are written to @target
+ * until the job is cancelled or manually completed.
+ */
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+ int64_t speed, BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ BlockDriverCompletionFunc *cb, void *opaque,
+ Error **errp);
+
+#endif /* BLOCK_INT_H */
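
To show how the hooks in struct BlockDriver fit together, here is a hypothetical skeleton of a trivial driver; the "nulldev" name and all of its callbacks are invented, and only members that exist in the structure above are filled in.

/* Hypothetical driver skeleton against struct BlockDriver above. */
#include "block/block_int.h"

typedef struct {
    int64_t length;                       /* per-instance state, see instance_size */
} BDRVNullState;

static int nulldev_open(BlockDriverState *bs, QDict *options, int flags)
{
    BDRVNullState *s = bs->opaque;        /* allocated by the generic layer */
    s->length = 1 << 30;                  /* pretend to be a 1 GiB device */
    return 0;
}

static void nulldev_close(BlockDriverState *bs)
{
}

static int64_t nulldev_getlength(BlockDriverState *bs)
{
    BDRVNullState *s = bs->opaque;
    return s->length;
}

static int coroutine_fn nulldev_co_readv(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *qiov)
{
    return 0;                             /* reads succeed, data left untouched */
}

static BlockDriver bdrv_nulldev = {
    .format_name    = "nulldev",
    .instance_size  = sizeof(BDRVNullState),
    .bdrv_open      = nulldev_open,
    .bdrv_close     = nulldev_close,
    .bdrv_getlength = nulldev_getlength,
    .bdrv_co_readv  = nulldev_co_readv,
};

static void nulldev_register(void)
{
    bdrv_register(&bdrv_nulldev);         /* declared in block.h */
}
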
diff --git a/contrib/qemu/include/block/blockjob.h b/contrib/qemu/include/block/blockjob.h
new file mode 100644
index 000000000..c290d07bb
--- /dev/null
+++ b/contrib/qemu/include/block/blockjob.h
@@ -0,0 +1,278 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCKJOB_H
+#define BLOCKJOB_H 1
+
+#include "block/block.h"
+
+/**
+ * BlockJobType:
+ *
+ * A class type for block job objects.
+ */
+typedef struct BlockJobType {
+ /** Derived BlockJob struct size */
+ size_t instance_size;
+
+ /** String describing the operation, part of query-block-jobs QMP API */
+ const char *job_type;
+
+ /** Optional callback for job types that support setting a speed limit */
+ void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
+
+ /** Optional callback for job types that need to forward I/O status reset */
+ void (*iostatus_reset)(BlockJob *job);
+
+ /**
+ * Optional callback for job types whose completion must be triggered
+ * manually.
+ */
+ void (*complete)(BlockJob *job, Error **errp);
+} BlockJobType;
+
+/**
+ * BlockJob:
+ *
+ * Long-running operation on a BlockDriverState.
+ */
+struct BlockJob {
+ /** The job type, including the job vtable. */
+ const BlockJobType *job_type;
+
+ /** The block device on which the job is operating. */
+ BlockDriverState *bs;
+
+ /**
+ * The coroutine that executes the job. If not NULL, it is
+ * reentered when busy is false and the job is cancelled.
+ */
+ Coroutine *co;
+
+ /**
+ * Set to true if the job should cancel itself. The flag must
+ * always be tested just before toggling the busy flag from false
+ * to true. After a job has been cancelled, it should only yield
+ * if #qemu_aio_wait will ("sooner or later") reenter the coroutine.
+ */
+ bool cancelled;
+
+ /**
+ * Set to true if the job is either paused, or will pause itself
+ * as soon as possible (if busy == true).
+ */
+ bool paused;
+
+ /**
+ * Set to false by the job while it is in a quiescent state, where
+ * no I/O is pending and the job has yielded on any condition
+ * that is not detected by #qemu_aio_wait, such as a timer.
+ */
+ bool busy;
+
+ /** Status that is published by the query-block-jobs QMP API */
+ BlockDeviceIoStatus iostatus;
+
+ /** Offset that is published by the query-block-jobs QMP API */
+ int64_t offset;
+
+ /** Length that is published by the query-block-jobs QMP API */
+ int64_t len;
+
+ /** Speed that was set with @block_job_set_speed. */
+ int64_t speed;
+
+ /** The completion function that will be called when the job completes. */
+ BlockDriverCompletionFunc *cb;
+
+ /** The opaque value that is passed to the completion function. */
+ void *opaque;
+};
+
+/**
+ * block_job_create:
+ * @job_type: The class object for the newly-created job.
+ * @bs: The block device on which the job operates.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Create a new long-running block device job and return it. The job
+ * will call @cb asynchronously when the job completes. Note that
+ * @bs may have been closed by the time @cb is called. If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
+void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
+ int64_t speed, BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp);
+
+/**
+ * block_job_sleep_ns:
+ * @job: The job that calls the function.
+ * @clock: The clock to sleep on.
+ * @ns: How many nanoseconds to stop for.
+ *
+ * Put the job to sleep (assuming that it wasn't canceled) for @ns
+ * nanoseconds. Canceling the job will interrupt the wait immediately.
+ */
+void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns);
+
+/**
+ * block_job_completed:
+ * @job: The job being completed.
+ * @ret: The status code.
+ *
+ * Call the completion function that was registered at creation time, and
+ * free @job.
+ */
+void block_job_completed(BlockJob *job, int ret);
+
+/**
+ * block_job_set_speed:
+ * @job: The job to set the speed for.
+ * @speed: The new value
+ * @errp: Error object.
+ *
+ * Set a rate-limiting parameter for the job; the actual meaning may
+ * vary depending on the job type.
+ */
+void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp);
+
+/**
+ * block_job_cancel:
+ * @job: The job to be canceled.
+ *
+ * Asynchronously cancel the specified job.
+ */
+void block_job_cancel(BlockJob *job);
+
+/**
+ * block_job_complete:
+ * @job: The job to be completed.
+ * @errp: Error object.
+ *
+ * Asynchronously complete the specified job.
+ */
+void block_job_complete(BlockJob *job, Error **errp);
+
+/**
+ * block_job_is_cancelled:
+ * @job: The job being queried.
+ *
+ * Returns whether the job is scheduled for cancellation.
+ */
+bool block_job_is_cancelled(BlockJob *job);
+
+/**
+ * block_job_query:
+ * @job: The job to get information about.
+ *
+ * Return information about a job.
+ */
+BlockJobInfo *block_job_query(BlockJob *job);
+
+/**
+ * block_job_pause:
+ * @job: The job to be paused.
+ *
+ * Asynchronously pause the specified job.
+ */
+void block_job_pause(BlockJob *job);
+
+/**
+ * block_job_resume:
+ * @job: The job to be resumed.
+ *
+ * Resume the specified job.
+ */
+void block_job_resume(BlockJob *job);
+
+/**
+ * qobject_from_block_job:
+ * @job: The job whose information is requested.
+ *
+ * Return a QDict corresponding to @job's query-block-jobs entry.
+ */
+QObject *qobject_from_block_job(BlockJob *job);
+
+/**
+ * block_job_ready:
+ * @job: The job which is now ready to complete.
+ *
+ * Send a BLOCK_JOB_READY event for the specified job.
+ */
+void block_job_ready(BlockJob *job);
+
+/**
+ * block_job_is_paused:
+ * @job: The job being queried.
+ *
+ * Returns whether the job is currently paused, or will pause
+ * as soon as it reaches a sleeping point.
+ */
+bool block_job_is_paused(BlockJob *job);
+
+/**
+ * block_job_cancel_sync:
+ * @job: The job to be canceled.
+ *
+ * Synchronously cancel the job. The completion callback is called
+ * before the function returns. The job may actually complete
+ * instead of canceling itself; the circumstances under which this
+ * happens depend on the kind of job that is active.
+ *
+ * Returns the return value from the job if the job actually completed
+ * during the call, or -ECANCELED if it was canceled.
+ */
+int block_job_cancel_sync(BlockJob *job);
+
+/**
+ * block_job_iostatus_reset:
+ * @job: The job whose I/O status should be reset.
+ *
+ * Reset I/O status on @job and on BlockDriverState objects it uses,
+ * other than job->bs.
+ */
+void block_job_iostatus_reset(BlockJob *job);
+
+/**
+ * block_job_error_action:
+ * @job: The job to signal an error for.
+ * @bs: The block device on which to set an I/O error.
+ * @on_err: The error action setting.
+ * @is_read: Whether the operation was a read.
+ * @error: The error that was reported.
+ *
+ * Report an I/O error for a block job and possibly stop the VM. Return the
+ * action that was selected based on @on_err and @error.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
+ BlockdevOnError on_err,
+ int is_read, int error);
+#endif
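
Finally, a hypothetical sketch of the smallest job type one could build on these declarations; real jobs (stream, commit, mirror, backup) follow the same create-then-enter-coroutine pattern, but the "nop" names and the trivial coroutine body here are invented for illustration.

/* Hypothetical do-nothing block job built on the declarations above. */
#include "block/blockjob.h"

typedef struct {
    BlockJob common;                      /* must be the first member */
} NopBlockJob;

static const BlockJobType nop_job_type = {
    .instance_size = sizeof(NopBlockJob),
    .job_type      = "nop",
};

static void coroutine_fn nop_job_run(void *opaque)
{
    NopBlockJob *job = opaque;

    /* A real job would loop over the image, checking
     * block_job_is_cancelled() between chunks; this one finishes at once. */
    block_job_completed(&job->common, 0);
}

static void nop_job_start(BlockDriverState *bs,
                          BlockDriverCompletionFunc *cb, void *opaque,
                          Error **errp)
{
    NopBlockJob *job = block_job_create(&nop_job_type, bs, 0, cb, opaque, errp);

    if (!job) {
        return;
    }
    job->common.co = qemu_coroutine_create(nop_job_run);
    qemu_coroutine_enter(job->common.co, job);
}
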
diff --git a/contrib/qemu/include/block/coroutine.h b/contrib/qemu/include/block/coroutine.h
new file mode 100644
index 000000000..377805a3b
--- /dev/null
+++ b/contrib/qemu/include/block/coroutine.h
@@ -0,0 +1,218 @@
+/*
+ * QEMU coroutine implementation
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Kevin Wolf <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COROUTINE_H
+#define QEMU_COROUTINE_H
+
+#include <stdbool.h>
+#include "qemu/queue.h"
+#include "qemu/timer.h"
+
+/**
+ * Coroutines are a mechanism for stack switching and can be used for
+ * cooperative userspace threading. These functions provide a simple but
+ * useful flavor of coroutines that is suitable for writing sequential code,
+ * rather than callbacks, for operations that need to give up control while
+ * waiting for events to complete.
+ *
+ * These functions are re-entrant and may be used outside the global mutex.
+ */
+
+/**
+ * Mark a function that executes in coroutine context
+ *
+ * Functions that execute in coroutine context cannot be called directly from
+ * normal functions. In the future it would be nice to enable compiler or
+ * static checker support for catching such errors. This annotation might make
+ * it possible and in the meantime it serves as documentation.
+ *
+ * For example:
+ *
+ * static void coroutine_fn foo(void) {
+ * ....
+ * }
+ */
+#define coroutine_fn
+
+typedef struct Coroutine Coroutine;
+
+/**
+ * Coroutine entry point
+ *
+ * When the coroutine is entered for the first time, opaque is passed in as an
+ * argument.
+ *
+ * When this function returns, the coroutine is destroyed automatically and
+ * execution continues in the caller who last entered the coroutine.
+ */
+typedef void coroutine_fn CoroutineEntry(void *opaque);
+
+/**
+ * Create a new coroutine
+ *
+ * Use qemu_coroutine_enter() to actually transfer control to the coroutine.
+ */
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry);
+
+/**
+ * Transfer control to a coroutine
+ *
+ * The opaque argument is passed as the argument to the entry point when
+ * entering the coroutine for the first time. It is subsequently ignored.
+ */
+void qemu_coroutine_enter(Coroutine *coroutine, void *opaque);
+
+/**
+ * Transfer control back to a coroutine's caller
+ *
+ * This function does not return until the coroutine is re-entered using
+ * qemu_coroutine_enter().
+ */
+void coroutine_fn qemu_coroutine_yield(void);
+
+/**
+ * Get the currently executing coroutine
+ */
+Coroutine *coroutine_fn qemu_coroutine_self(void);
+
+/**
+ * Return whether or not currently inside a coroutine
+ *
+ * This can be used to write functions that work both when in coroutine context
+ * and when not in coroutine context. Note that such functions cannot use the
+ * coroutine_fn annotation since they work outside coroutine context.
+ */
+bool qemu_in_coroutine(void);
+
+
+
+/**
+ * CoQueues are a mechanism to queue coroutines in order to continue executing
+ * them later. They provide the fundamental primitives on which coroutine locks
+ * are built.
+ */
+typedef struct CoQueue {
+ QTAILQ_HEAD(, Coroutine) entries;
+ AioContext *ctx;
+} CoQueue;
+
+/**
+ * Initialise a CoQueue. This must be called before any other operation is used
+ * on the CoQueue.
+ */
+void qemu_co_queue_init(CoQueue *queue);
+
+/**
+ * Adds the current coroutine to the CoQueue and transfers control to the
+ * caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+
+/**
+ * Adds the current coroutine to the head of the CoQueue and transfers
+ * control to the caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue);
+
+/**
+ * Restarts the next coroutine in the CoQueue and removes it from the queue.
+ *
+ * Returns true if a coroutine was restarted, false if the queue is empty.
+ */
+bool qemu_co_queue_next(CoQueue *queue);
+
+/**
+ * Restarts all coroutines in the CoQueue and leaves the queue empty.
+ */
+void qemu_co_queue_restart_all(CoQueue *queue);
+
+/**
+ * Checks if the CoQueue is empty.
+ */
+bool qemu_co_queue_empty(CoQueue *queue);
+
+
+/**
+ * Provides a mutex that can be used to synchronise coroutines
+ */
+typedef struct CoMutex {
+ bool locked;
+ CoQueue queue;
+} CoMutex;
+
+/**
+ * Initialises a CoMutex. This must be called before any other operation is used
+ * on the CoMutex.
+ */
+void qemu_co_mutex_init(CoMutex *mutex);
+
+/**
+ * Locks the mutex. If the lock cannot be taken immediately, control is
+ * transferred to the caller of the current coroutine.
+ */
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
+
+/**
+ * Unlocks the mutex and schedules the next coroutine that was waiting for this
+ * lock to be run.
+ */
+void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
+
+typedef struct CoRwlock {
+ bool writer;
+ int reader;
+ CoQueue queue;
+} CoRwlock;
+
+/**
+ * Initialises a CoRwlock. This must be called before any other operation
+ * is used on the CoRwlock.
+ */
+void qemu_co_rwlock_init(CoRwlock *lock);
+
+/**
+ * Read locks the CoRwlock. If the lock cannot be taken immediately because
+ * of a parallel writer, control is transferred to the caller of the current
+ * coroutine.
+ */
+void qemu_co_rwlock_rdlock(CoRwlock *lock);
+
+/**
+ * Write locks the CoRwlock. If the lock cannot be taken immediately because
+ * of a parallel reader, control is transferred to the caller of the current
+ * coroutine.
+ */
+void qemu_co_rwlock_wrlock(CoRwlock *lock);
+
+/**
+ * Unlocks the read/write lock and schedules the next coroutine that was
+ * waiting for this lock to be run.
+ */
+void qemu_co_rwlock_unlock(CoRwlock *lock);
+
+/**
+ * Yield the coroutine for a given duration
+ *
+ * Note this function uses timers and hence only works when a main loop is in
+ * use. See main-loop.h and do not use from qemu-tool programs.
+ */
+void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns);
+
+/**
+ * Yield until a file descriptor becomes readable
+ *
+ * Note that this function clobbers the handlers for the file descriptor.
+ */
+void coroutine_fn yield_until_fd_readable(int fd);
+#endif /* QEMU_COROUTINE_H */
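Editorial note, not part of the patch: a small usage sketch of the API above, assuming the QEMU main loop is available. It uses only functions declared in this header (qemu_coroutine_create/enter/yield and the CoMutex helpers).

static CoMutex lock;

static void coroutine_fn worker(void *opaque)
{
    int *counter = opaque;

    qemu_co_mutex_lock(&lock);       /* may yield if another coroutine holds it */
    (*counter)++;
    qemu_co_mutex_unlock(&lock);

    qemu_coroutine_yield();          /* give control back to the caller once */
    /* returning here destroys the coroutine automatically */
}

static void run_example(void)
{
    static int counter;
    Coroutine *co;

    qemu_co_mutex_init(&lock);
    co = qemu_coroutine_create(worker);
    qemu_coroutine_enter(co, &counter);  /* runs until the yield above */
    qemu_coroutine_enter(co, NULL);      /* resumes after the yield, then terminates */
}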
diff --git a/contrib/qemu/include/block/coroutine_int.h b/contrib/qemu/include/block/coroutine_int.h
new file mode 100644
index 000000000..f133d65af
--- /dev/null
+++ b/contrib/qemu/include/block/coroutine_int.h
@@ -0,0 +1,53 @@
+/*
+ * Coroutine internals
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_COROUTINE_INT_H
+#define QEMU_COROUTINE_INT_H
+
+#include "qemu/queue.h"
+#include "block/coroutine.h"
+
+typedef enum {
+ COROUTINE_YIELD = 1,
+ COROUTINE_TERMINATE = 2,
+} CoroutineAction;
+
+struct Coroutine {
+ CoroutineEntry *entry;
+ void *entry_arg;
+ Coroutine *caller;
+ QSLIST_ENTRY(Coroutine) pool_next;
+
+ /* Coroutines that should be woken up when we yield or terminate */
+ QTAILQ_HEAD(, Coroutine) co_queue_wakeup;
+ QTAILQ_ENTRY(Coroutine) co_queue_next;
+};
+
+Coroutine *qemu_coroutine_new(void);
+void qemu_coroutine_delete(Coroutine *co);
+CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to,
+ CoroutineAction action);
+void coroutine_fn qemu_co_queue_run_restart(Coroutine *co);
+
+#endif
diff --git a/contrib/qemu/include/block/snapshot.h b/contrib/qemu/include/block/snapshot.h
new file mode 100644
index 000000000..eaf61f032
--- /dev/null
+++ b/contrib/qemu/include/block/snapshot.h
@@ -0,0 +1,53 @@
+/*
+ * Block layer snapshot related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef SNAPSHOT_H
+#define SNAPSHOT_H
+
+#include "qemu-common.h"
+
+typedef struct QEMUSnapshotInfo {
+ char id_str[128]; /* unique snapshot id */
+ /* the following fields are informative. They are not needed for
+ the consistency of the snapshot */
+ char name[256]; /* user chosen name */
+ uint64_t vm_state_size; /* VM state info size */
+ uint32_t date_sec; /* UTC date of the snapshot */
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec; /* VM clock relative to boot */
+} QEMUSnapshotInfo;
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+ const char *name);
+int bdrv_can_snapshot(BlockDriverState *bs);
+int bdrv_snapshot_create(BlockDriverState *bs,
+ QEMUSnapshotInfo *sn_info);
+int bdrv_snapshot_goto(BlockDriverState *bs,
+ const char *snapshot_id);
+int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id);
+int bdrv_snapshot_list(BlockDriverState *bs,
+ QEMUSnapshotInfo **psn_info);
+int bdrv_snapshot_load_tmp(BlockDriverState *bs,
+ const char *snapshot_name);
+#endif
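Editorial note, not part of the patch: a hedged sketch of how these snapshot helpers are typically chained. `bs` is assumed to be an already-opened BlockDriverState, the snapshot id/name are made up, and freeing the list returned by bdrv_snapshot_list() with g_free() is an assumption about its allocation convention, not something this header states.

static int create_and_list_snapshots(BlockDriverState *bs)
{
    QEMUSnapshotInfo sn = { .id_str = "1", .name = "before-upgrade" };
    QEMUSnapshotInfo *list = NULL;
    int i, count;

    if (!bdrv_can_snapshot(bs)) {
        return -1;                           /* driver does not support snapshots */
    }
    if (bdrv_snapshot_create(bs, &sn) < 0) {
        return -1;
    }

    count = bdrv_snapshot_list(bs, &list);   /* number of snapshots, or negative error */
    if (count < 0) {
        return count;
    }
    for (i = 0; i < count; i++) {
        /* inspect list[i].name, list[i].vm_state_size, ... */
    }
    g_free(list);                            /* assumed allocation convention */
    return 0;
}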
diff --git a/contrib/qemu/include/config.h b/contrib/qemu/include/config.h
new file mode 100644
index 000000000..e20f78696
--- /dev/null
+++ b/contrib/qemu/include/config.h
@@ -0,0 +1,2 @@
+#include "config-host.h"
+#include "config-target.h"
diff --git a/contrib/qemu/include/exec/cpu-common.h b/contrib/qemu/include/exec/cpu-common.h
new file mode 100644
index 000000000..e4996e19c
--- /dev/null
+++ b/contrib/qemu/include/exec/cpu-common.h
@@ -0,0 +1,124 @@
+#ifndef CPU_COMMON_H
+#define CPU_COMMON_H 1
+
+/* CPU interfaces that are target independent. */
+
+#ifndef CONFIG_USER_ONLY
+#include "exec/hwaddr.h"
+#endif
+
+#ifndef NEED_CPU_H
+#include "exec/poison.h"
+#endif
+
+#include "qemu/bswap.h"
+#include "qemu/queue.h"
+
+/**
+ * CPUListState:
+ * @cpu_fprintf: Print function.
+ * @file: File to print to using @cpu_fprintf.
+ *
+ * State commonly used for iterating over CPU models.
+ */
+typedef struct CPUListState {
+ fprintf_function cpu_fprintf;
+ FILE *file;
+} CPUListState;
+
+#if !defined(CONFIG_USER_ONLY)
+
+enum device_endian {
+ DEVICE_NATIVE_ENDIAN,
+ DEVICE_BIG_ENDIAN,
+ DEVICE_LITTLE_ENDIAN,
+};
+
+/* address in the RAM (different from a physical address) */
+#if defined(CONFIG_XEN_BACKEND)
+typedef uint64_t ram_addr_t;
+# define RAM_ADDR_MAX UINT64_MAX
+# define RAM_ADDR_FMT "%" PRIx64
+#else
+typedef uintptr_t ram_addr_t;
+# define RAM_ADDR_MAX UINTPTR_MAX
+# define RAM_ADDR_FMT "%" PRIxPTR
+#endif
+
+/* memory API */
+
+typedef void CPUWriteMemoryFunc(void *opaque, hwaddr addr, uint32_t value);
+typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
+
+void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
+/* This should not be used by devices. */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
+
+void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
+ int len, int is_write);
+static inline void cpu_physical_memory_read(hwaddr addr,
+ void *buf, int len)
+{
+ cpu_physical_memory_rw(addr, buf, len, 0);
+}
+static inline void cpu_physical_memory_write(hwaddr addr,
+ const void *buf, int len)
+{
+ cpu_physical_memory_rw(addr, (void *)buf, len, 1);
+}
+void *cpu_physical_memory_map(hwaddr addr,
+ hwaddr *plen,
+ int is_write);
+void cpu_physical_memory_unmap(void *buffer, hwaddr len,
+ int is_write, hwaddr access_len);
+void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque));
+
+bool cpu_physical_memory_is_io(hwaddr phys_addr);
+
+/* Coalesced MMIO regions are areas where write operations can be reordered.
+ * This usually implies that write operations are side-effect free. This allows
+ * batching which can make a major impact on performance when using
+ * virtualization.
+ */
+void qemu_flush_coalesced_mmio_buffer(void);
+
+uint32_t ldub_phys(hwaddr addr);
+uint32_t lduw_le_phys(hwaddr addr);
+uint32_t lduw_be_phys(hwaddr addr);
+uint32_t ldl_le_phys(hwaddr addr);
+uint32_t ldl_be_phys(hwaddr addr);
+uint64_t ldq_le_phys(hwaddr addr);
+uint64_t ldq_be_phys(hwaddr addr);
+void stb_phys(hwaddr addr, uint32_t val);
+void stw_le_phys(hwaddr addr, uint32_t val);
+void stw_be_phys(hwaddr addr, uint32_t val);
+void stl_le_phys(hwaddr addr, uint32_t val);
+void stl_be_phys(hwaddr addr, uint32_t val);
+void stq_le_phys(hwaddr addr, uint64_t val);
+void stq_be_phys(hwaddr addr, uint64_t val);
+
+#ifdef NEED_CPU_H
+uint32_t lduw_phys(hwaddr addr);
+uint32_t ldl_phys(hwaddr addr);
+uint64_t ldq_phys(hwaddr addr);
+void stl_phys_notdirty(hwaddr addr, uint32_t val);
+void stw_phys(hwaddr addr, uint32_t val);
+void stl_phys(hwaddr addr, uint32_t val);
+void stq_phys(hwaddr addr, uint64_t val);
+#endif
+
+void cpu_physical_memory_write_rom(hwaddr addr,
+ const uint8_t *buf, int len);
+
+extern struct MemoryRegion io_mem_rom;
+extern struct MemoryRegion io_mem_notdirty;
+
+typedef void (RAMBlockIterFunc)(void *host_addr,
+ ram_addr_t offset, ram_addr_t length, void *opaque);
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
+
+#endif
+
+#endif /* !CPU_COMMON_H */
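Editorial note, not part of the patch: a short sketch of the guest-RAM access helpers above. It only makes sense inside a QEMU build with guest memory set up, and the addresses are purely illustrative.

static void copy_guest_word(hwaddr src, hwaddr dst)
{
    uint32_t word;

    /* Read 4 bytes of guest-physical memory into a host buffer... */
    cpu_physical_memory_read(src, &word, sizeof(word));

    /* ...and write them back to a different guest-physical address. */
    cpu_physical_memory_write(dst, &word, sizeof(word));
}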
diff --git a/contrib/qemu/include/exec/hwaddr.h b/contrib/qemu/include/exec/hwaddr.h
new file mode 100644
index 000000000..c9eb78fba
--- /dev/null
+++ b/contrib/qemu/include/exec/hwaddr.h
@@ -0,0 +1,20 @@
+/* Define hwaddr if it exists. */
+
+#ifndef HWADDR_H
+#define HWADDR_H
+
+#define HWADDR_BITS 64
+/* hwaddr is the type of a physical address (its size can
+ be different from 'target_ulong'). */
+
+typedef uint64_t hwaddr;
+#define HWADDR_MAX UINT64_MAX
+#define TARGET_FMT_plx "%016" PRIx64
+#define HWADDR_PRId PRId64
+#define HWADDR_PRIi PRIi64
+#define HWADDR_PRIo PRIo64
+#define HWADDR_PRIu PRIu64
+#define HWADDR_PRIx PRIx64
+#define HWADDR_PRIX PRIX64
+
+#endif
diff --git a/contrib/qemu/include/exec/poison.h b/contrib/qemu/include/exec/poison.h
new file mode 100644
index 000000000..2341a7504
--- /dev/null
+++ b/contrib/qemu/include/exec/poison.h
@@ -0,0 +1,63 @@
+/* Poison identifiers that should not be used when building
+ target independent device code. */
+
+#ifndef HW_POISON_H
+#define HW_POISON_H
+#ifdef __GNUC__
+
+#pragma GCC poison TARGET_I386
+#pragma GCC poison TARGET_X86_64
+#pragma GCC poison TARGET_ALPHA
+#pragma GCC poison TARGET_ARM
+#pragma GCC poison TARGET_CRIS
+#pragma GCC poison TARGET_LM32
+#pragma GCC poison TARGET_M68K
+#pragma GCC poison TARGET_MIPS
+#pragma GCC poison TARGET_MIPS64
+#pragma GCC poison TARGET_OPENRISC
+#pragma GCC poison TARGET_PPC
+#pragma GCC poison TARGET_PPCEMB
+#pragma GCC poison TARGET_PPC64
+#pragma GCC poison TARGET_ABI32
+#pragma GCC poison TARGET_SH4
+#pragma GCC poison TARGET_SPARC
+#pragma GCC poison TARGET_SPARC64
+
+#pragma GCC poison TARGET_WORDS_BIGENDIAN
+#pragma GCC poison BSWAP_NEEDED
+
+#pragma GCC poison TARGET_LONG_BITS
+#pragma GCC poison TARGET_FMT_lx
+#pragma GCC poison TARGET_FMT_ld
+
+#pragma GCC poison TARGET_PAGE_SIZE
+#pragma GCC poison TARGET_PAGE_MASK
+#pragma GCC poison TARGET_PAGE_BITS
+#pragma GCC poison TARGET_PAGE_ALIGN
+
+#pragma GCC poison CPUArchState
+#pragma GCC poison env
+
+#pragma GCC poison lduw_phys
+#pragma GCC poison ldl_phys
+#pragma GCC poison ldq_phys
+#pragma GCC poison stl_phys_notdirty
+#pragma GCC poison stw_phys
+#pragma GCC poison stl_phys
+#pragma GCC poison stq_phys
+
+#pragma GCC poison CPU_INTERRUPT_HARD
+#pragma GCC poison CPU_INTERRUPT_EXITTB
+#pragma GCC poison CPU_INTERRUPT_HALT
+#pragma GCC poison CPU_INTERRUPT_DEBUG
+#pragma GCC poison CPU_INTERRUPT_TGT_EXT_0
+#pragma GCC poison CPU_INTERRUPT_TGT_EXT_1
+#pragma GCC poison CPU_INTERRUPT_TGT_EXT_2
+#pragma GCC poison CPU_INTERRUPT_TGT_EXT_3
+#pragma GCC poison CPU_INTERRUPT_TGT_EXT_4
+#pragma GCC poison CPU_INTERRUPT_TGT_INT_0
+#pragma GCC poison CPU_INTERRUPT_TGT_INT_1
+#pragma GCC poison CPU_INTERRUPT_TGT_INT_2
+
+#endif
+#endif
diff --git a/contrib/qemu/include/fpu/softfloat.h b/contrib/qemu/include/fpu/softfloat.h
new file mode 100644
index 000000000..f3927e241
--- /dev/null
+++ b/contrib/qemu/include/fpu/softfloat.h
@@ -0,0 +1,641 @@
+/*
+ * QEMU float support
+ *
+ * Derived from SoftFloat.
+ */
+
+/*============================================================================
+
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+
+=============================================================================*/
+
+#ifndef SOFTFLOAT_H
+#define SOFTFLOAT_H
+
+#if defined(CONFIG_SOLARIS) && defined(CONFIG_NEEDS_LIBSUNMATH)
+#include <sunmath.h>
+#endif
+
+#include <inttypes.h>
+#include "config-host.h"
+#include "qemu/osdep.h"
+
+/*----------------------------------------------------------------------------
+| Each of the following `typedef's defines the most convenient type that holds
+| integers of at least as many bits as specified. For example, `uint8' should
+| be the most convenient type that can hold unsigned integers of as many as
+| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most
+| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed
+| to the same as `int'.
+*----------------------------------------------------------------------------*/
+typedef uint8_t flag;
+typedef uint8_t uint8;
+typedef int8_t int8;
+typedef unsigned int uint32;
+typedef signed int int32;
+typedef uint64_t uint64;
+typedef int64_t int64;
+
+#define LIT64( a ) a##LL
+#define INLINE static inline
+
+#define STATUS_PARAM , float_status *status
+#define STATUS(field) status->field
+#define STATUS_VAR , status
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point ordering relations
+*----------------------------------------------------------------------------*/
+enum {
+ float_relation_less = -1,
+ float_relation_equal = 0,
+ float_relation_greater = 1,
+ float_relation_unordered = 2
+};
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point types.
+*----------------------------------------------------------------------------*/
+/* Use structures for soft-float types. This prevents accidentally mixing
+ them with native int/float types. A sufficiently clever compiler and
+ sane ABI should be able to see through these structs. However
+ x86/gcc 3.x seems to struggle a bit, so leave them disabled by default. */
+//#define USE_SOFTFLOAT_STRUCT_TYPES
+#ifdef USE_SOFTFLOAT_STRUCT_TYPES
+typedef struct {
+ uint16_t v;
+} float16;
+#define float16_val(x) (((float16)(x)).v)
+#define make_float16(x) __extension__ ({ float16 f16_val = {x}; f16_val; })
+#define const_float16(x) { x }
+typedef struct {
+ uint32_t v;
+} float32;
+/* The cast ensures an error if the wrong type is passed. */
+#define float32_val(x) (((float32)(x)).v)
+#define make_float32(x) __extension__ ({ float32 f32_val = {x}; f32_val; })
+#define const_float32(x) { x }
+typedef struct {
+ uint64_t v;
+} float64;
+#define float64_val(x) (((float64)(x)).v)
+#define make_float64(x) __extension__ ({ float64 f64_val = {x}; f64_val; })
+#define const_float64(x) { x }
+#else
+typedef uint16_t float16;
+typedef uint32_t float32;
+typedef uint64_t float64;
+#define float16_val(x) (x)
+#define float32_val(x) (x)
+#define float64_val(x) (x)
+#define make_float16(x) (x)
+#define make_float32(x) (x)
+#define make_float64(x) (x)
+#define const_float16(x) (x)
+#define const_float32(x) (x)
+#define const_float64(x) (x)
+#endif
+typedef struct {
+ uint64_t low;
+ uint16_t high;
+} floatx80;
+#define make_floatx80(exp, mant) ((floatx80) { mant, exp })
+#define make_floatx80_init(exp, mant) { .low = mant, .high = exp }
+typedef struct {
+#ifdef HOST_WORDS_BIGENDIAN
+ uint64_t high, low;
+#else
+ uint64_t low, high;
+#endif
+} float128;
+#define make_float128(high_, low_) ((float128) { .high = high_, .low = low_ })
+#define make_float128_init(high_, low_) { .high = high_, .low = low_ }
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point underflow tininess-detection mode.
+*----------------------------------------------------------------------------*/
+enum {
+ float_tininess_after_rounding = 0,
+ float_tininess_before_rounding = 1
+};
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point rounding mode.
+*----------------------------------------------------------------------------*/
+enum {
+ float_round_nearest_even = 0,
+ float_round_down = 1,
+ float_round_up = 2,
+ float_round_to_zero = 3
+};
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point exception flags.
+*----------------------------------------------------------------------------*/
+enum {
+ float_flag_invalid = 1,
+ float_flag_divbyzero = 4,
+ float_flag_overflow = 8,
+ float_flag_underflow = 16,
+ float_flag_inexact = 32,
+ float_flag_input_denormal = 64,
+ float_flag_output_denormal = 128
+};
+
+typedef struct float_status {
+ signed char float_detect_tininess;
+ signed char float_rounding_mode;
+ signed char float_exception_flags;
+ signed char floatx80_rounding_precision;
+ /* should denormalised results go to zero and set the inexact flag? */
+ flag flush_to_zero;
+ /* should denormalised inputs go to zero and set the input_denormal flag? */
+ flag flush_inputs_to_zero;
+ flag default_nan_mode;
+} float_status;
+
+void set_float_rounding_mode(int val STATUS_PARAM);
+void set_float_exception_flags(int val STATUS_PARAM);
+INLINE void set_float_detect_tininess(int val STATUS_PARAM)
+{
+ STATUS(float_detect_tininess) = val;
+}
+INLINE void set_flush_to_zero(flag val STATUS_PARAM)
+{
+ STATUS(flush_to_zero) = val;
+}
+INLINE void set_flush_inputs_to_zero(flag val STATUS_PARAM)
+{
+ STATUS(flush_inputs_to_zero) = val;
+}
+INLINE void set_default_nan_mode(flag val STATUS_PARAM)
+{
+ STATUS(default_nan_mode) = val;
+}
+INLINE int get_float_exception_flags(float_status *status)
+{
+ return STATUS(float_exception_flags);
+}
+void set_floatx80_rounding_precision(int val STATUS_PARAM);
+
+/*----------------------------------------------------------------------------
+| Routine to raise any or all of the software IEC/IEEE floating-point
+| exception flags.
+*----------------------------------------------------------------------------*/
+void float_raise( int8 flags STATUS_PARAM);
+
+/*----------------------------------------------------------------------------
+| Options to indicate which negations to perform in float*_muladd().
+| Using these differs from negating an input or output before calling
+| the muladd function: with these options a NaN doesn't have its
+| sign bit inverted before it is propagated.
+*----------------------------------------------------------------------------*/
+enum {
+ float_muladd_negate_c = 1,
+ float_muladd_negate_product = 2,
+ float_muladd_negate_result = 4,
+};
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE integer-to-floating-point conversion routines.
+*----------------------------------------------------------------------------*/
+float32 int32_to_float32( int32 STATUS_PARAM );
+float64 int32_to_float64( int32 STATUS_PARAM );
+float32 uint32_to_float32( uint32 STATUS_PARAM );
+float64 uint32_to_float64( uint32 STATUS_PARAM );
+floatx80 int32_to_floatx80( int32 STATUS_PARAM );
+float128 int32_to_float128( int32 STATUS_PARAM );
+float32 int64_to_float32( int64 STATUS_PARAM );
+float32 uint64_to_float32( uint64 STATUS_PARAM );
+float64 int64_to_float64( int64 STATUS_PARAM );
+float64 uint64_to_float64( uint64 STATUS_PARAM );
+floatx80 int64_to_floatx80( int64 STATUS_PARAM );
+float128 int64_to_float128( int64 STATUS_PARAM );
+float128 uint64_to_float128( uint64 STATUS_PARAM );
+
+/*----------------------------------------------------------------------------
+| Software half-precision conversion routines.
+*----------------------------------------------------------------------------*/
+float16 float32_to_float16( float32, flag STATUS_PARAM );
+float32 float16_to_float32( float16, flag STATUS_PARAM );
+
+/*----------------------------------------------------------------------------
+| Software half-precision operations.
+*----------------------------------------------------------------------------*/
+int float16_is_quiet_nan( float16 );
+int float16_is_signaling_nan( float16 );
+float16 float16_maybe_silence_nan( float16 );
+
+INLINE int float16_is_any_nan(float16 a)
+{
+ return ((float16_val(a) & ~0x8000) > 0x7c00);
+}
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated half-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const float16 float16_default_nan;
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE single-precision conversion routines.
+*----------------------------------------------------------------------------*/
+int_fast16_t float32_to_int16_round_to_zero(float32 STATUS_PARAM);
+uint_fast16_t float32_to_uint16_round_to_zero(float32 STATUS_PARAM);
+int32 float32_to_int32( float32 STATUS_PARAM );
+int32 float32_to_int32_round_to_zero( float32 STATUS_PARAM );
+uint32 float32_to_uint32( float32 STATUS_PARAM );
+uint32 float32_to_uint32_round_to_zero( float32 STATUS_PARAM );
+int64 float32_to_int64( float32 STATUS_PARAM );
+int64 float32_to_int64_round_to_zero( float32 STATUS_PARAM );
+float64 float32_to_float64( float32 STATUS_PARAM );
+floatx80 float32_to_floatx80( float32 STATUS_PARAM );
+float128 float32_to_float128( float32 STATUS_PARAM );
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE single-precision operations.
+*----------------------------------------------------------------------------*/
+float32 float32_round_to_int( float32 STATUS_PARAM );
+float32 float32_add( float32, float32 STATUS_PARAM );
+float32 float32_sub( float32, float32 STATUS_PARAM );
+float32 float32_mul( float32, float32 STATUS_PARAM );
+float32 float32_div( float32, float32 STATUS_PARAM );
+float32 float32_rem( float32, float32 STATUS_PARAM );
+float32 float32_muladd(float32, float32, float32, int STATUS_PARAM);
+float32 float32_sqrt( float32 STATUS_PARAM );
+float32 float32_exp2( float32 STATUS_PARAM );
+float32 float32_log2( float32 STATUS_PARAM );
+int float32_eq( float32, float32 STATUS_PARAM );
+int float32_le( float32, float32 STATUS_PARAM );
+int float32_lt( float32, float32 STATUS_PARAM );
+int float32_unordered( float32, float32 STATUS_PARAM );
+int float32_eq_quiet( float32, float32 STATUS_PARAM );
+int float32_le_quiet( float32, float32 STATUS_PARAM );
+int float32_lt_quiet( float32, float32 STATUS_PARAM );
+int float32_unordered_quiet( float32, float32 STATUS_PARAM );
+int float32_compare( float32, float32 STATUS_PARAM );
+int float32_compare_quiet( float32, float32 STATUS_PARAM );
+float32 float32_min(float32, float32 STATUS_PARAM);
+float32 float32_max(float32, float32 STATUS_PARAM);
+int float32_is_quiet_nan( float32 );
+int float32_is_signaling_nan( float32 );
+float32 float32_maybe_silence_nan( float32 );
+float32 float32_scalbn( float32, int STATUS_PARAM );
+
+INLINE float32 float32_abs(float32 a)
+{
+ /* Note that abs does *not* handle NaN specially, nor does
+ * it flush denormal inputs to zero.
+ */
+ return make_float32(float32_val(a) & 0x7fffffff);
+}
+
+INLINE float32 float32_chs(float32 a)
+{
+ /* Note that chs does *not* handle NaN specially, nor does
+ * it flush denormal inputs to zero.
+ */
+ return make_float32(float32_val(a) ^ 0x80000000);
+}
+
+INLINE int float32_is_infinity(float32 a)
+{
+ return (float32_val(a) & 0x7fffffff) == 0x7f800000;
+}
+
+INLINE int float32_is_neg(float32 a)
+{
+ return float32_val(a) >> 31;
+}
+
+INLINE int float32_is_zero(float32 a)
+{
+ return (float32_val(a) & 0x7fffffff) == 0;
+}
+
+INLINE int float32_is_any_nan(float32 a)
+{
+ return ((float32_val(a) & ~(1 << 31)) > 0x7f800000UL);
+}
+
+INLINE int float32_is_zero_or_denormal(float32 a)
+{
+ return (float32_val(a) & 0x7f800000) == 0;
+}
+
+INLINE float32 float32_set_sign(float32 a, int sign)
+{
+ return make_float32((float32_val(a) & 0x7fffffff) | (sign << 31));
+}
+
+#define float32_zero make_float32(0)
+#define float32_one make_float32(0x3f800000)
+#define float32_ln2 make_float32(0x3f317218)
+#define float32_pi make_float32(0x40490fdb)
+#define float32_half make_float32(0x3f000000)
+#define float32_infinity make_float32(0x7f800000)
+
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated single-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const float32 float32_default_nan;
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE double-precision conversion routines.
+*----------------------------------------------------------------------------*/
+int_fast16_t float64_to_int16_round_to_zero(float64 STATUS_PARAM);
+uint_fast16_t float64_to_uint16_round_to_zero(float64 STATUS_PARAM);
+int32 float64_to_int32( float64 STATUS_PARAM );
+int32 float64_to_int32_round_to_zero( float64 STATUS_PARAM );
+uint32 float64_to_uint32( float64 STATUS_PARAM );
+uint32 float64_to_uint32_round_to_zero( float64 STATUS_PARAM );
+int64 float64_to_int64( float64 STATUS_PARAM );
+int64 float64_to_int64_round_to_zero( float64 STATUS_PARAM );
+uint64 float64_to_uint64 (float64 a STATUS_PARAM);
+uint64 float64_to_uint64_round_to_zero (float64 a STATUS_PARAM);
+float32 float64_to_float32( float64 STATUS_PARAM );
+floatx80 float64_to_floatx80( float64 STATUS_PARAM );
+float128 float64_to_float128( float64 STATUS_PARAM );
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE double-precision operations.
+*----------------------------------------------------------------------------*/
+float64 float64_round_to_int( float64 STATUS_PARAM );
+float64 float64_trunc_to_int( float64 STATUS_PARAM );
+float64 float64_add( float64, float64 STATUS_PARAM );
+float64 float64_sub( float64, float64 STATUS_PARAM );
+float64 float64_mul( float64, float64 STATUS_PARAM );
+float64 float64_div( float64, float64 STATUS_PARAM );
+float64 float64_rem( float64, float64 STATUS_PARAM );
+float64 float64_muladd(float64, float64, float64, int STATUS_PARAM);
+float64 float64_sqrt( float64 STATUS_PARAM );
+float64 float64_log2( float64 STATUS_PARAM );
+int float64_eq( float64, float64 STATUS_PARAM );
+int float64_le( float64, float64 STATUS_PARAM );
+int float64_lt( float64, float64 STATUS_PARAM );
+int float64_unordered( float64, float64 STATUS_PARAM );
+int float64_eq_quiet( float64, float64 STATUS_PARAM );
+int float64_le_quiet( float64, float64 STATUS_PARAM );
+int float64_lt_quiet( float64, float64 STATUS_PARAM );
+int float64_unordered_quiet( float64, float64 STATUS_PARAM );
+int float64_compare( float64, float64 STATUS_PARAM );
+int float64_compare_quiet( float64, float64 STATUS_PARAM );
+float64 float64_min(float64, float64 STATUS_PARAM);
+float64 float64_max(float64, float64 STATUS_PARAM);
+int float64_is_quiet_nan( float64 a );
+int float64_is_signaling_nan( float64 );
+float64 float64_maybe_silence_nan( float64 );
+float64 float64_scalbn( float64, int STATUS_PARAM );
+
+INLINE float64 float64_abs(float64 a)
+{
+ /* Note that abs does *not* handle NaN specially, nor does
+ * it flush denormal inputs to zero.
+ */
+ return make_float64(float64_val(a) & 0x7fffffffffffffffLL);
+}
+
+INLINE float64 float64_chs(float64 a)
+{
+ /* Note that chs does *not* handle NaN specially, nor does
+ * it flush denormal inputs to zero.
+ */
+ return make_float64(float64_val(a) ^ 0x8000000000000000LL);
+}
+
+INLINE int float64_is_infinity(float64 a)
+{
+ return (float64_val(a) & 0x7fffffffffffffffLL ) == 0x7ff0000000000000LL;
+}
+
+INLINE int float64_is_neg(float64 a)
+{
+ return float64_val(a) >> 63;
+}
+
+INLINE int float64_is_zero(float64 a)
+{
+ return (float64_val(a) & 0x7fffffffffffffffLL) == 0;
+}
+
+INLINE int float64_is_any_nan(float64 a)
+{
+ return ((float64_val(a) & ~(1ULL << 63)) > 0x7ff0000000000000ULL);
+}
+
+INLINE int float64_is_zero_or_denormal(float64 a)
+{
+ return (float64_val(a) & 0x7ff0000000000000LL) == 0;
+}
+
+INLINE float64 float64_set_sign(float64 a, int sign)
+{
+ return make_float64((float64_val(a) & 0x7fffffffffffffffULL)
+ | ((int64_t)sign << 63));
+}
+
+#define float64_zero make_float64(0)
+#define float64_one make_float64(0x3ff0000000000000LL)
+#define float64_ln2 make_float64(0x3fe62e42fefa39efLL)
+#define float64_pi make_float64(0x400921fb54442d18LL)
+#define float64_half make_float64(0x3fe0000000000000LL)
+#define float64_infinity make_float64(0x7ff0000000000000LL)
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated double-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const float64 float64_default_nan;
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision conversion routines.
+*----------------------------------------------------------------------------*/
+int32 floatx80_to_int32( floatx80 STATUS_PARAM );
+int32 floatx80_to_int32_round_to_zero( floatx80 STATUS_PARAM );
+int64 floatx80_to_int64( floatx80 STATUS_PARAM );
+int64 floatx80_to_int64_round_to_zero( floatx80 STATUS_PARAM );
+float32 floatx80_to_float32( floatx80 STATUS_PARAM );
+float64 floatx80_to_float64( floatx80 STATUS_PARAM );
+float128 floatx80_to_float128( floatx80 STATUS_PARAM );
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision operations.
+*----------------------------------------------------------------------------*/
+floatx80 floatx80_round_to_int( floatx80 STATUS_PARAM );
+floatx80 floatx80_add( floatx80, floatx80 STATUS_PARAM );
+floatx80 floatx80_sub( floatx80, floatx80 STATUS_PARAM );
+floatx80 floatx80_mul( floatx80, floatx80 STATUS_PARAM );
+floatx80 floatx80_div( floatx80, floatx80 STATUS_PARAM );
+floatx80 floatx80_rem( floatx80, floatx80 STATUS_PARAM );
+floatx80 floatx80_sqrt( floatx80 STATUS_PARAM );
+int floatx80_eq( floatx80, floatx80 STATUS_PARAM );
+int floatx80_le( floatx80, floatx80 STATUS_PARAM );
+int floatx80_lt( floatx80, floatx80 STATUS_PARAM );
+int floatx80_unordered( floatx80, floatx80 STATUS_PARAM );
+int floatx80_eq_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_le_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
+int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_is_quiet_nan( floatx80 );
+int floatx80_is_signaling_nan( floatx80 );
+floatx80 floatx80_maybe_silence_nan( floatx80 );
+floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
+
+INLINE floatx80 floatx80_abs(floatx80 a)
+{
+ a.high &= 0x7fff;
+ return a;
+}
+
+INLINE floatx80 floatx80_chs(floatx80 a)
+{
+ a.high ^= 0x8000;
+ return a;
+}
+
+INLINE int floatx80_is_infinity(floatx80 a)
+{
+ return (a.high & 0x7fff) == 0x7fff && a.low == 0x8000000000000000LL;
+}
+
+INLINE int floatx80_is_neg(floatx80 a)
+{
+ return a.high >> 15;
+}
+
+INLINE int floatx80_is_zero(floatx80 a)
+{
+ return (a.high & 0x7fff) == 0 && a.low == 0;
+}
+
+INLINE int floatx80_is_zero_or_denormal(floatx80 a)
+{
+ return (a.high & 0x7fff) == 0;
+}
+
+INLINE int floatx80_is_any_nan(floatx80 a)
+{
+ return ((a.high & 0x7fff) == 0x7fff) && (a.low<<1);
+}
+
+#define floatx80_zero make_floatx80(0x0000, 0x0000000000000000LL)
+#define floatx80_one make_floatx80(0x3fff, 0x8000000000000000LL)
+#define floatx80_ln2 make_floatx80(0x3ffe, 0xb17217f7d1cf79acLL)
+#define floatx80_pi make_floatx80(0x4000, 0xc90fdaa22168c235LL)
+#define floatx80_half make_floatx80(0x3ffe, 0x8000000000000000LL)
+#define floatx80_infinity make_floatx80(0x7fff, 0x8000000000000000LL)
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const floatx80 floatx80_default_nan;
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE quadruple-precision conversion routines.
+*----------------------------------------------------------------------------*/
+int32 float128_to_int32( float128 STATUS_PARAM );
+int32 float128_to_int32_round_to_zero( float128 STATUS_PARAM );
+int64 float128_to_int64( float128 STATUS_PARAM );
+int64 float128_to_int64_round_to_zero( float128 STATUS_PARAM );
+float32 float128_to_float32( float128 STATUS_PARAM );
+float64 float128_to_float64( float128 STATUS_PARAM );
+floatx80 float128_to_floatx80( float128 STATUS_PARAM );
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE quadruple-precision operations.
+*----------------------------------------------------------------------------*/
+float128 float128_round_to_int( float128 STATUS_PARAM );
+float128 float128_add( float128, float128 STATUS_PARAM );
+float128 float128_sub( float128, float128 STATUS_PARAM );
+float128 float128_mul( float128, float128 STATUS_PARAM );
+float128 float128_div( float128, float128 STATUS_PARAM );
+float128 float128_rem( float128, float128 STATUS_PARAM );
+float128 float128_sqrt( float128 STATUS_PARAM );
+int float128_eq( float128, float128 STATUS_PARAM );
+int float128_le( float128, float128 STATUS_PARAM );
+int float128_lt( float128, float128 STATUS_PARAM );
+int float128_unordered( float128, float128 STATUS_PARAM );
+int float128_eq_quiet( float128, float128 STATUS_PARAM );
+int float128_le_quiet( float128, float128 STATUS_PARAM );
+int float128_lt_quiet( float128, float128 STATUS_PARAM );
+int float128_unordered_quiet( float128, float128 STATUS_PARAM );
+int float128_compare( float128, float128 STATUS_PARAM );
+int float128_compare_quiet( float128, float128 STATUS_PARAM );
+int float128_is_quiet_nan( float128 );
+int float128_is_signaling_nan( float128 );
+float128 float128_maybe_silence_nan( float128 );
+float128 float128_scalbn( float128, int STATUS_PARAM );
+
+INLINE float128 float128_abs(float128 a)
+{
+ a.high &= 0x7fffffffffffffffLL;
+ return a;
+}
+
+INLINE float128 float128_chs(float128 a)
+{
+ a.high ^= 0x8000000000000000LL;
+ return a;
+}
+
+INLINE int float128_is_infinity(float128 a)
+{
+ return (a.high & 0x7fffffffffffffffLL) == 0x7fff000000000000LL && a.low == 0;
+}
+
+INLINE int float128_is_neg(float128 a)
+{
+ return a.high >> 63;
+}
+
+INLINE int float128_is_zero(float128 a)
+{
+ return (a.high & 0x7fffffffffffffffLL) == 0 && a.low == 0;
+}
+
+INLINE int float128_is_zero_or_denormal(float128 a)
+{
+ return (a.high & 0x7fff000000000000LL) == 0;
+}
+
+INLINE int float128_is_any_nan(float128 a)
+{
+ return ((a.high >> 48) & 0x7fff) == 0x7fff &&
+ ((a.low != 0) || ((a.high & 0xffffffffffffLL) != 0));
+}
+
+#define float128_zero make_float128(0, 0)
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated quadruple-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const float128 float128_default_nan;
+
+#endif /* !SOFTFLOAT_H */
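Editorial note, not part of the patch: a minimal sketch of the calling convention implied by STATUS_PARAM above. Every arithmetic routine threads a float_status through, and exception flags accumulate in it; the input bit patterns here are arbitrary.

static float32 add_and_check(uint32_t a_bits, uint32_t b_bits)
{
    float_status status = { 0 };
    float32 a = make_float32(a_bits);
    float32 b = make_float32(b_bits);
    float32 sum;

    set_float_rounding_mode(float_round_nearest_even, &status);
    sum = float32_add(a, b, &status);

    if (get_float_exception_flags(&status) & float_flag_overflow) {
        /* overflow occurred; sum holds the rounded result (typically infinity) */
    }
    return sum;
}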
diff --git a/contrib/qemu/include/glib-compat.h b/contrib/qemu/include/glib-compat.h
new file mode 100644
index 000000000..8aa77afd6
--- /dev/null
+++ b/contrib/qemu/include/glib-compat.h
@@ -0,0 +1,27 @@
+/*
+ * GLIB Compatibility Functions
+ *
+ * Copyright IBM, Corp. 2013
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_GLIB_COMPAT_H
+#define QEMU_GLIB_COMPAT_H
+
+#include <glib.h>
+
+#if !GLIB_CHECK_VERSION(2, 14, 0)
+static inline guint g_timeout_add_seconds(guint interval, GSourceFunc function,
+ gpointer data)
+{
+ return g_timeout_add(interval * 1000, function, data);
+}
+#endif
+
+#endif
diff --git a/contrib/qemu/include/migration/migration.h b/contrib/qemu/include/migration/migration.h
new file mode 100644
index 000000000..bc9fde0b2
--- /dev/null
+++ b/contrib/qemu/include/migration/migration.h
@@ -0,0 +1,157 @@
+/*
+ * QEMU live migration
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_MIGRATION_H
+#define QEMU_MIGRATION_H
+
+#include "qapi/qmp/qdict.h"
+#include "qemu-common.h"
+#include "qemu/thread.h"
+#include "qemu/notify.h"
+#include "qapi/error.h"
+#include "migration/vmstate.h"
+#include "qapi-types.h"
+#include "exec/cpu-common.h"
+
+struct MigrationParams {
+ bool blk;
+ bool shared;
+};
+
+typedef struct MigrationState MigrationState;
+
+struct MigrationState
+{
+ int64_t bandwidth_limit;
+ size_t bytes_xfer;
+ size_t xfer_limit;
+ QemuThread thread;
+ QEMUBH *cleanup_bh;
+ QEMUFile *file;
+
+ int state;
+ MigrationParams params;
+ double mbps;
+ int64_t total_time;
+ int64_t downtime;
+ int64_t expected_downtime;
+ int64_t dirty_pages_rate;
+ int64_t dirty_bytes_rate;
+ bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
+ int64_t xbzrle_cache_size;
+};
+
+void process_incoming_migration(QEMUFile *f);
+
+void qemu_start_incoming_migration(const char *uri, Error **errp);
+
+uint64_t migrate_max_downtime(void);
+
+void do_info_migrate_print(Monitor *mon, const QObject *data);
+
+void do_info_migrate(Monitor *mon, QObject **ret_data);
+
+void exec_start_incoming_migration(const char *host_port, Error **errp);
+
+void exec_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp);
+
+void tcp_start_incoming_migration(const char *host_port, Error **errp);
+
+void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp);
+
+void unix_start_incoming_migration(const char *path, Error **errp);
+
+void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp);
+
+void fd_start_incoming_migration(const char *path, Error **errp);
+
+void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp);
+
+void migrate_fd_error(MigrationState *s);
+
+void migrate_fd_connect(MigrationState *s);
+
+int migrate_fd_close(MigrationState *s);
+
+void add_migration_state_change_notifier(Notifier *notify);
+void remove_migration_state_change_notifier(Notifier *notify);
+bool migration_is_active(MigrationState *);
+bool migration_has_finished(MigrationState *);
+bool migration_has_failed(MigrationState *);
+MigrationState *migrate_get_current(void);
+
+uint64_t ram_bytes_remaining(void);
+uint64_t ram_bytes_transferred(void);
+uint64_t ram_bytes_total(void);
+
+void acct_update_position(QEMUFile *f, size_t size, bool zero);
+
+extern SaveVMHandlers savevm_ram_handlers;
+
+uint64_t dup_mig_bytes_transferred(void);
+uint64_t dup_mig_pages_transferred(void);
+uint64_t skipped_mig_bytes_transferred(void);
+uint64_t skipped_mig_pages_transferred(void);
+uint64_t norm_mig_bytes_transferred(void);
+uint64_t norm_mig_pages_transferred(void);
+uint64_t xbzrle_mig_bytes_transferred(void);
+uint64_t xbzrle_mig_pages_transferred(void);
+uint64_t xbzrle_mig_pages_overflow(void);
+uint64_t xbzrle_mig_pages_cache_miss(void);
+
+/**
+ * @migrate_add_blocker - prevent migration from proceeding
+ *
+ * @reason - an error to be returned whenever migration is attempted
+ */
+void migrate_add_blocker(Error *reason);
+
+/**
+ * @migrate_del_blocker - remove a blocking error from migration
+ *
+ * @reason - the error blocking migration
+ */
+void migrate_del_blocker(Error *reason);
+
+bool migrate_rdma_pin_all(void);
+
+bool migrate_auto_converge(void);
+
+int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
+ uint8_t *dst, int dlen);
+int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
+
+int migrate_use_xbzrle(void);
+int64_t migrate_xbzrle_cache_size(void);
+
+int64_t xbzrle_cache_resize(int64_t new_size);
+
+void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
+void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
+void ram_control_load_hook(QEMUFile *f, uint64_t flags);
+
+/* Whenever this is found in the data stream, the flags
+ * will be passed to ram_control_load_hook on the incoming-migration
+ * side. This lets before_ram_iterate/after_ram_iterate add
+ * transport-specific sections to the RAM migration data.
+ */
+#define RAM_SAVE_FLAG_HOOK 0x80
+
+#define RAM_SAVE_CONTROL_NOT_SUPP -1000
+#define RAM_SAVE_CONTROL_DELAYED -2000
+
+size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+ ram_addr_t offset, size_t size,
+ int *bytes_sent);
+
+#endif
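Editorial note, not part of the patch: a hedged sketch of the blocker API documented above. error_setg()/error_free() come from qapi/error.h (already included by this header); the message text and the busy/idle trigger are illustrative.

static Error *mig_blocker;

static void block_migration_while_busy(bool busy)
{
    if (busy && !mig_blocker) {
        error_setg(&mig_blocker, "device is busy and cannot be migrated");
        migrate_add_blocker(mig_blocker);   /* returned to anyone attempting migration */
    } else if (!busy && mig_blocker) {
        migrate_del_blocker(mig_blocker);
        error_free(mig_blocker);
        mig_blocker = NULL;
    }
}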
diff --git a/contrib/qemu/include/migration/qemu-file.h b/contrib/qemu/include/migration/qemu-file.h
new file mode 100644
index 000000000..0f757fbeb
--- /dev/null
+++ b/contrib/qemu/include/migration/qemu-file.h
@@ -0,0 +1,266 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef QEMU_FILE_H
+#define QEMU_FILE_H 1
+#include "exec/cpu-common.h"
+
+/* This function writes a chunk of data to a file at the given position.
+ * The pos argument can be ignored if the file is only being used for
+ * streaming. The handler should try to write all of the data it can.
+ */
+typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
+ int64_t pos, int size);
+
+/* Read a chunk of data from a file at the given position. The pos argument
+ * can be ignored if the file is only being used for streaming. The number of
+ * bytes actually read should be returned.
+ */
+typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
+ int64_t pos, int size);
+
+/* Close a file
+ *
+ * Return negative error number on error, 0 or positive value on success.
+ *
+ * The meaning of the return value on success depends on the specific back-end being
+ * used.
+ */
+typedef int (QEMUFileCloseFunc)(void *opaque);
+
+/* Called to return the OS file descriptor associated with the QEMUFile.
+ */
+typedef int (QEMUFileGetFD)(void *opaque);
+
+/*
+ * This function writes an iovec to file.
+ */
+typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov,
+ int iovcnt, int64_t pos);
+
+/*
+ * This function provides hooks around different
+ * stages of RAM migration.
+ */
+typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags);
+
+/*
+ * Constants used by ram_control_* hooks
+ */
+#define RAM_CONTROL_SETUP 0
+#define RAM_CONTROL_ROUND 1
+#define RAM_CONTROL_HOOK 2
+#define RAM_CONTROL_FINISH 3
+
+/*
+ * This function allows overriding where the RAM page
+ * is saved (to RDMA, for example).
+ */
+typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
+ ram_addr_t block_offset,
+ ram_addr_t offset,
+ size_t size,
+ int *bytes_sent);
+
+typedef struct QEMUFileOps {
+ QEMUFilePutBufferFunc *put_buffer;
+ QEMUFileGetBufferFunc *get_buffer;
+ QEMUFileCloseFunc *close;
+ QEMUFileGetFD *get_fd;
+ QEMUFileWritevBufferFunc *writev_buffer;
+ QEMURamHookFunc *before_ram_iterate;
+ QEMURamHookFunc *after_ram_iterate;
+ QEMURamHookFunc *hook_ram_load;
+ QEMURamSaveFunc *save_page;
+} QEMUFileOps;
+
+QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
+QEMUFile *qemu_fopen(const char *filename, const char *mode);
+QEMUFile *qemu_fdopen(int fd, const char *mode);
+QEMUFile *qemu_fopen_socket(int fd, const char *mode);
+QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
+int qemu_get_fd(QEMUFile *f);
+int qemu_fclose(QEMUFile *f);
+int64_t qemu_ftell(QEMUFile *f);
+void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
+void qemu_put_byte(QEMUFile *f, int v);
+/*
+ * put_buffer without copying the buffer.
+ * The buffer must remain valid until it has been sent asynchronously.
+ */
+void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size);
+bool qemu_file_mode_is_not_valid(const char *mode);
+
+static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
+{
+ qemu_put_byte(f, (int)v);
+}
+
+#define qemu_put_sbyte qemu_put_byte
+
+void qemu_put_be16(QEMUFile *f, unsigned int v);
+void qemu_put_be32(QEMUFile *f, unsigned int v);
+void qemu_put_be64(QEMUFile *f, uint64_t v);
+int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size);
+int qemu_get_byte(QEMUFile *f);
+void qemu_update_position(QEMUFile *f, size_t size);
+
+static inline unsigned int qemu_get_ubyte(QEMUFile *f)
+{
+ return (unsigned int)qemu_get_byte(f);
+}
+
+#define qemu_get_sbyte qemu_get_byte
+
+unsigned int qemu_get_be16(QEMUFile *f);
+unsigned int qemu_get_be32(QEMUFile *f);
+uint64_t qemu_get_be64(QEMUFile *f);
+
+int qemu_file_rate_limit(QEMUFile *f);
+void qemu_file_reset_rate_limit(QEMUFile *f);
+void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
+int64_t qemu_file_get_rate_limit(QEMUFile *f);
+int qemu_file_get_error(QEMUFile *f);
+void qemu_fflush(QEMUFile *f);
+
+static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
+{
+ qemu_put_be64(f, *pv);
+}
+
+static inline void qemu_put_be32s(QEMUFile *f, const uint32_t *pv)
+{
+ qemu_put_be32(f, *pv);
+}
+
+static inline void qemu_put_be16s(QEMUFile *f, const uint16_t *pv)
+{
+ qemu_put_be16(f, *pv);
+}
+
+static inline void qemu_put_8s(QEMUFile *f, const uint8_t *pv)
+{
+ qemu_put_byte(f, *pv);
+}
+
+static inline void qemu_get_be64s(QEMUFile *f, uint64_t *pv)
+{
+ *pv = qemu_get_be64(f);
+}
+
+static inline void qemu_get_be32s(QEMUFile *f, uint32_t *pv)
+{
+ *pv = qemu_get_be32(f);
+}
+
+static inline void qemu_get_be16s(QEMUFile *f, uint16_t *pv)
+{
+ *pv = qemu_get_be16(f);
+}
+
+static inline void qemu_get_8s(QEMUFile *f, uint8_t *pv)
+{
+ *pv = qemu_get_byte(f);
+}
+
+// Signed versions for type safety
+static inline void qemu_put_sbuffer(QEMUFile *f, const int8_t *buf, int size)
+{
+ qemu_put_buffer(f, (const uint8_t *)buf, size);
+}
+
+static inline void qemu_put_sbe16(QEMUFile *f, int v)
+{
+ qemu_put_be16(f, (unsigned int)v);
+}
+
+static inline void qemu_put_sbe32(QEMUFile *f, int v)
+{
+ qemu_put_be32(f, (unsigned int)v);
+}
+
+static inline void qemu_put_sbe64(QEMUFile *f, int64_t v)
+{
+ qemu_put_be64(f, (uint64_t)v);
+}
+
+static inline size_t qemu_get_sbuffer(QEMUFile *f, int8_t *buf, int size)
+{
+ return qemu_get_buffer(f, (uint8_t *)buf, size);
+}
+
+static inline int qemu_get_sbe16(QEMUFile *f)
+{
+ return (int)qemu_get_be16(f);
+}
+
+static inline int qemu_get_sbe32(QEMUFile *f)
+{
+ return (int)qemu_get_be32(f);
+}
+
+static inline int64_t qemu_get_sbe64(QEMUFile *f)
+{
+ return (int64_t)qemu_get_be64(f);
+}
+
+static inline void qemu_put_s8s(QEMUFile *f, const int8_t *pv)
+{
+ qemu_put_8s(f, (const uint8_t *)pv);
+}
+
+static inline void qemu_put_sbe16s(QEMUFile *f, const int16_t *pv)
+{
+ qemu_put_be16s(f, (const uint16_t *)pv);
+}
+
+static inline void qemu_put_sbe32s(QEMUFile *f, const int32_t *pv)
+{
+ qemu_put_be32s(f, (const uint32_t *)pv);
+}
+
+static inline void qemu_put_sbe64s(QEMUFile *f, const int64_t *pv)
+{
+ qemu_put_be64s(f, (const uint64_t *)pv);
+}
+
+static inline void qemu_get_s8s(QEMUFile *f, int8_t *pv)
+{
+ qemu_get_8s(f, (uint8_t *)pv);
+}
+
+static inline void qemu_get_sbe16s(QEMUFile *f, int16_t *pv)
+{
+ qemu_get_be16s(f, (uint16_t *)pv);
+}
+
+static inline void qemu_get_sbe32s(QEMUFile *f, int32_t *pv)
+{
+ qemu_get_be32s(f, (uint32_t *)pv);
+}
+
+static inline void qemu_get_sbe64s(QEMUFile *f, int64_t *pv)
+{
+ qemu_get_be64s(f, (uint64_t *)pv);
+}
+#endif
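Editorial note, not part of the patch: a small sketch of the symmetric put/get helpers above, assuming a QEMUFile opened elsewhere (for example with qemu_fopen()); the record layout and version number are made up for illustration.

/* Writer side: emit a tiny versioned record. */
static void put_record(QEMUFile *f, const uint8_t *payload, uint32_t len)
{
    qemu_put_be32(f, 1);                 /* hypothetical record version */
    qemu_put_be32(f, len);
    qemu_put_buffer(f, payload, len);
}

/* Reader side: must mirror the writer exactly, field for field. */
static int get_record(QEMUFile *f, uint8_t *payload, uint32_t max_len)
{
    uint32_t version = qemu_get_be32(f);
    uint32_t len = qemu_get_be32(f);

    if (version != 1 || len > max_len) {
        return -1;
    }
    qemu_get_buffer(f, payload, len);
    return qemu_file_get_error(f);       /* 0 on success, negative errno on I/O error */
}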
diff --git a/contrib/qemu/include/migration/vmstate.h b/contrib/qemu/include/migration/vmstate.h
new file mode 100644
index 000000000..1c31b5d6f
--- /dev/null
+++ b/contrib/qemu/include/migration/vmstate.h
@@ -0,0 +1,740 @@
+/*
+ * QEMU migration/snapshot declarations
+ *
+ * Copyright (c) 2009-2011 Red Hat, Inc.
+ *
+ * Original author: Juan Quintela <quintela@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef QEMU_VMSTATE_H
+#define QEMU_VMSTATE_H 1
+
+#ifndef CONFIG_USER_ONLY
+#include <migration/qemu-file.h>
+#endif
+
+typedef void SaveStateHandler(QEMUFile *f, void *opaque);
+typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id);
+
+typedef struct SaveVMHandlers {
+ /* This runs inside the iothread lock. */
+ void (*set_params)(const MigrationParams *params, void * opaque);
+ SaveStateHandler *save_state;
+
+ void (*cancel)(void *opaque);
+ int (*save_live_complete)(QEMUFile *f, void *opaque);
+
+ /* This runs both outside and inside the iothread lock. */
+ bool (*is_active)(void *opaque);
+
+ /* This runs outside the iothread lock in the migration case, and
+ * within the lock in the savevm case. The callback had better only
+ * use data that is local to the migration thread or protected
+ * by other locks.
+ */
+ int (*save_live_iterate)(QEMUFile *f, void *opaque);
+
+ /* This runs outside the iothread lock! */
+ int (*save_live_setup)(QEMUFile *f, void *opaque);
+ uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
+
+ LoadStateHandler *load_state;
+} SaveVMHandlers;
+
+int register_savevm(DeviceState *dev,
+ const char *idstr,
+ int instance_id,
+ int version_id,
+ SaveStateHandler *save_state,
+ LoadStateHandler *load_state,
+ void *opaque);
+
+int register_savevm_live(DeviceState *dev,
+ const char *idstr,
+ int instance_id,
+ int version_id,
+ SaveVMHandlers *ops,
+ void *opaque);
+
+void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque);
+void register_device_unmigratable(DeviceState *dev, const char *idstr,
+ void *opaque);
+
+
+typedef struct VMStateInfo VMStateInfo;
+typedef struct VMStateDescription VMStateDescription;
+
+struct VMStateInfo {
+ const char *name;
+ int (*get)(QEMUFile *f, void *pv, size_t size);
+ void (*put)(QEMUFile *f, void *pv, size_t size);
+};
+
+enum VMStateFlags {
+ VMS_SINGLE = 0x001,
+ VMS_POINTER = 0x002,
+ VMS_ARRAY = 0x004,
+ VMS_STRUCT = 0x008,
+ VMS_VARRAY_INT32 = 0x010, /* Array with size in int32_t field*/
+ VMS_BUFFER = 0x020, /* static sized buffer */
+ VMS_ARRAY_OF_POINTER = 0x040,
+ VMS_VARRAY_UINT16 = 0x080, /* Array with size in uint16_t field */
+ VMS_VBUFFER = 0x100, /* Buffer with size in int32_t field */
+ VMS_MULTIPLY = 0x200, /* multiply "size" field by field_size */
+ VMS_VARRAY_UINT8 = 0x400, /* Array with size in uint8_t field*/
+ VMS_VARRAY_UINT32 = 0x800, /* Array with size in uint32_t field*/
+};
+
+typedef struct {
+ const char *name;
+ size_t offset;
+ size_t size;
+ size_t start;
+ int num;
+ size_t num_offset;
+ size_t size_offset;
+ const VMStateInfo *info;
+ enum VMStateFlags flags;
+ const VMStateDescription *vmsd;
+ int version_id;
+ bool (*field_exists)(void *opaque, int version_id);
+} VMStateField;
+
+typedef struct VMStateSubsection {
+ const VMStateDescription *vmsd;
+ bool (*needed)(void *opaque);
+} VMStateSubsection;
+
+struct VMStateDescription {
+ const char *name;
+ int unmigratable;
+ int version_id;
+ int minimum_version_id;
+ int minimum_version_id_old;
+ LoadStateHandler *load_state_old;
+ int (*pre_load)(void *opaque);
+ int (*post_load)(void *opaque, int version_id);
+ void (*pre_save)(void *opaque);
+ VMStateField *fields;
+ const VMStateSubsection *subsections;
+};
+
+#ifdef CONFIG_USER_ONLY
+extern const VMStateDescription vmstate_dummy;
+#endif
+
+extern const VMStateInfo vmstate_info_bool;
+
+extern const VMStateInfo vmstate_info_int8;
+extern const VMStateInfo vmstate_info_int16;
+extern const VMStateInfo vmstate_info_int32;
+extern const VMStateInfo vmstate_info_int64;
+
+extern const VMStateInfo vmstate_info_uint8_equal;
+extern const VMStateInfo vmstate_info_uint16_equal;
+extern const VMStateInfo vmstate_info_int32_equal;
+extern const VMStateInfo vmstate_info_uint32_equal;
+extern const VMStateInfo vmstate_info_uint64_equal;
+extern const VMStateInfo vmstate_info_int32_le;
+
+extern const VMStateInfo vmstate_info_uint8;
+extern const VMStateInfo vmstate_info_uint16;
+extern const VMStateInfo vmstate_info_uint32;
+extern const VMStateInfo vmstate_info_uint64;
+
+extern const VMStateInfo vmstate_info_float64;
+
+extern const VMStateInfo vmstate_info_timer;
+extern const VMStateInfo vmstate_info_buffer;
+extern const VMStateInfo vmstate_info_unused_buffer;
+extern const VMStateInfo vmstate_info_bitmap;
+
+#define type_check_2darray(t1,t2,n,m) ((t1(*)[n][m])0 - (t2*)0)
+#define type_check_array(t1,t2,n) ((t1(*)[n])0 - (t2*)0)
+#define type_check_pointer(t1,t2) ((t1**)0 - (t2*)0)
+
+#define vmstate_offset_value(_state, _field, _type) \
+ (offsetof(_state, _field) + \
+ type_check(_type, typeof_field(_state, _field)))
+
+#define vmstate_offset_pointer(_state, _field, _type) \
+ (offsetof(_state, _field) + \
+ type_check_pointer(_type, typeof_field(_state, _field)))
+
+#define vmstate_offset_array(_state, _field, _type, _num) \
+ (offsetof(_state, _field) + \
+ type_check_array(_type, typeof_field(_state, _field), _num))
+
+#define vmstate_offset_2darray(_state, _field, _type, _n1, _n2) \
+ (offsetof(_state, _field) + \
+ type_check_2darray(_type, typeof_field(_state, _field), _n1, _n2))
+
+#define vmstate_offset_sub_array(_state, _field, _type, _start) \
+ (offsetof(_state, _field[_start]))
+
+#define vmstate_offset_buffer(_state, _field) \
+ vmstate_offset_array(_state, _field, uint8_t, \
+ sizeof(typeof_field(_state, _field)))
+
+#define VMSTATE_SINGLE_TEST(_field, _state, _test, _version, _info, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .field_exists = (_test), \
+ .size = sizeof(_type), \
+ .info = &(_info), \
+ .flags = VMS_SINGLE, \
+ .offset = vmstate_offset_value(_state, _field, _type), \
+}
+
+#define VMSTATE_POINTER(_field, _state, _version, _info, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_SINGLE|VMS_POINTER, \
+ .offset = vmstate_offset_value(_state, _field, _type), \
+}
+
+#define VMSTATE_POINTER_TEST(_field, _state, _test, _info, _type) { \
+ .name = (stringify(_field)), \
+ .info = &(_info), \
+ .field_exists = (_test), \
+ .size = sizeof(_type), \
+ .flags = VMS_SINGLE|VMS_POINTER, \
+ .offset = vmstate_offset_value(_state, _field, _type), \
+}
+
+#define VMSTATE_ARRAY(_field, _state, _num, _version, _info, _type) {\
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num = (_num), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_ARRAY, \
+ .offset = vmstate_offset_array(_state, _field, _type, _num), \
+}
+
+#define VMSTATE_2DARRAY(_field, _state, _n1, _n2, _version, _info, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num = (_n1) * (_n2), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_ARRAY, \
+ .offset = vmstate_offset_2darray(_state, _field, _type, _n1, _n2), \
+}
+
+#define VMSTATE_ARRAY_TEST(_field, _state, _num, _test, _info, _type) {\
+ .name = (stringify(_field)), \
+ .field_exists = (_test), \
+ .num = (_num), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_ARRAY, \
+ .offset = vmstate_offset_array(_state, _field, _type, _num),\
+}
+
+#define VMSTATE_SUB_ARRAY(_field, _state, _start, _num, _version, _info, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num = (_num), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_ARRAY, \
+ .offset = vmstate_offset_sub_array(_state, _field, _type, _start), \
+}
+
+#define VMSTATE_ARRAY_INT32_UNSAFE(_field, _state, _field_num, _info, _type) {\
+ .name = (stringify(_field)), \
+ .num_offset = vmstate_offset_value(_state, _field_num, int32_t), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_VARRAY_INT32, \
+ .offset = offsetof(_state, _field), \
+}
+
+#define VMSTATE_VARRAY_INT32(_field, _state, _field_num, _version, _info, _type) {\
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num_offset = vmstate_offset_value(_state, _field_num, int32_t), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_VARRAY_INT32|VMS_POINTER, \
+ .offset = vmstate_offset_pointer(_state, _field, _type), \
+}
+
+#define VMSTATE_VARRAY_UINT32(_field, _state, _field_num, _version, _info, _type) {\
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num_offset = vmstate_offset_value(_state, _field_num, uint32_t),\
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_VARRAY_UINT32|VMS_POINTER, \
+ .offset = vmstate_offset_pointer(_state, _field, _type), \
+}
+
+#define VMSTATE_VARRAY_UINT16_UNSAFE(_field, _state, _field_num, _version, _info, _type) {\
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),\
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_VARRAY_UINT16, \
+ .offset = offsetof(_state, _field), \
+}
+
+#define VMSTATE_STRUCT_TEST(_field, _state, _test, _version, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .field_exists = (_test), \
+ .vmsd = &(_vmsd), \
+ .size = sizeof(_type), \
+ .flags = VMS_STRUCT, \
+ .offset = vmstate_offset_value(_state, _field, _type), \
+}
+
+#define VMSTATE_STRUCT_POINTER_TEST(_field, _state, _test, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .field_exists = (_test), \
+ .vmsd = &(_vmsd), \
+ .size = sizeof(_type), \
+ .flags = VMS_STRUCT|VMS_POINTER, \
+ .offset = vmstate_offset_value(_state, _field, _type), \
+}
+
+#define VMSTATE_ARRAY_OF_POINTER(_field, _state, _num, _version, _info, _type) {\
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num = (_num), \
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_ARRAY|VMS_ARRAY_OF_POINTER, \
+ .offset = vmstate_offset_array(_state, _field, _type, _num), \
+}
+
+#define VMSTATE_STRUCT_ARRAY_TEST(_field, _state, _num, _test, _version, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .num = (_num), \
+ .field_exists = (_test), \
+ .version_id = (_version), \
+ .vmsd = &(_vmsd), \
+ .size = sizeof(_type), \
+ .flags = VMS_STRUCT|VMS_ARRAY, \
+ .offset = vmstate_offset_array(_state, _field, _type, _num),\
+}
+
+#define VMSTATE_STRUCT_VARRAY_UINT8(_field, _state, _field_num, _version, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .num_offset = vmstate_offset_value(_state, _field_num, uint8_t), \
+ .version_id = (_version), \
+ .vmsd = &(_vmsd), \
+ .size = sizeof(_type), \
+ .flags = VMS_STRUCT|VMS_VARRAY_UINT8, \
+ .offset = offsetof(_state, _field), \
+}
+
+#define VMSTATE_STRUCT_VARRAY_POINTER_INT32(_field, _state, _field_num, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = 0, \
+ .num_offset = vmstate_offset_value(_state, _field_num, int32_t), \
+ .size = sizeof(_type), \
+ .vmsd = &(_vmsd), \
+ .flags = VMS_POINTER | VMS_VARRAY_INT32 | VMS_STRUCT, \
+ .offset = vmstate_offset_pointer(_state, _field, _type), \
+}
+
+#define VMSTATE_STRUCT_VARRAY_POINTER_UINT32(_field, _state, _field_num, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = 0, \
+ .num_offset = vmstate_offset_value(_state, _field_num, uint32_t),\
+ .size = sizeof(_type), \
+ .vmsd = &(_vmsd), \
+ .flags = VMS_POINTER | VMS_VARRAY_UINT32 | VMS_STRUCT, \
+ .offset = vmstate_offset_pointer(_state, _field, _type), \
+}
+
+#define VMSTATE_STRUCT_VARRAY_POINTER_UINT16(_field, _state, _field_num, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .version_id = 0, \
+ .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),\
+ .size = sizeof(_type), \
+ .vmsd = &(_vmsd), \
+ .flags = VMS_POINTER | VMS_VARRAY_UINT16 | VMS_STRUCT, \
+ .offset = vmstate_offset_pointer(_state, _field, _type), \
+}
+
+#define VMSTATE_STRUCT_VARRAY_INT32(_field, _state, _field_num, _version, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .num_offset = vmstate_offset_value(_state, _field_num, int32_t), \
+ .version_id = (_version), \
+ .vmsd = &(_vmsd), \
+ .size = sizeof(_type), \
+ .flags = VMS_STRUCT|VMS_VARRAY_INT32, \
+ .offset = offsetof(_state, _field), \
+}
+
+#define VMSTATE_STRUCT_VARRAY_UINT32(_field, _state, _field_num, _version, _vmsd, _type) { \
+ .name = (stringify(_field)), \
+ .num_offset = vmstate_offset_value(_state, _field_num, uint32_t), \
+ .version_id = (_version), \
+ .vmsd = &(_vmsd), \
+ .size = sizeof(_type), \
+ .flags = VMS_STRUCT|VMS_VARRAY_UINT32, \
+ .offset = offsetof(_state, _field), \
+}
+
+#define VMSTATE_STATIC_BUFFER(_field, _state, _version, _test, _start, _size) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .field_exists = (_test), \
+ .size = (_size - _start), \
+ .info = &vmstate_info_buffer, \
+ .flags = VMS_BUFFER, \
+ .offset = vmstate_offset_buffer(_state, _field) + _start, \
+}
+
+#define VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test, _start, _field_size, _multiply) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .field_exists = (_test), \
+ .size_offset = vmstate_offset_value(_state, _field_size, uint32_t),\
+ .size = (_multiply), \
+ .info = &vmstate_info_buffer, \
+ .flags = VMS_VBUFFER|VMS_POINTER|VMS_MULTIPLY, \
+ .offset = offsetof(_state, _field), \
+ .start = (_start), \
+}
+
+#define VMSTATE_VBUFFER(_field, _state, _version, _test, _start, _field_size) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .field_exists = (_test), \
+ .size_offset = vmstate_offset_value(_state, _field_size, int32_t),\
+ .info = &vmstate_info_buffer, \
+ .flags = VMS_VBUFFER|VMS_POINTER, \
+ .offset = offsetof(_state, _field), \
+ .start = (_start), \
+}
+
+#define VMSTATE_VBUFFER_UINT32(_field, _state, _version, _test, _start, _field_size) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .field_exists = (_test), \
+ .size_offset = vmstate_offset_value(_state, _field_size, uint32_t),\
+ .info = &vmstate_info_buffer, \
+ .flags = VMS_VBUFFER|VMS_POINTER, \
+ .offset = offsetof(_state, _field), \
+ .start = (_start), \
+}
+
+#define VMSTATE_BUFFER_UNSAFE_INFO(_field, _state, _version, _info, _size) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .size = (_size), \
+ .info = &(_info), \
+ .flags = VMS_BUFFER, \
+ .offset = offsetof(_state, _field), \
+}
+
+#define VMSTATE_BUFFER_POINTER_UNSAFE(_field, _state, _version, _size) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .size = (_size), \
+ .info = &vmstate_info_buffer, \
+ .flags = VMS_BUFFER|VMS_POINTER, \
+ .offset = offsetof(_state, _field), \
+}
+
+#define VMSTATE_UNUSED_BUFFER(_test, _version, _size) { \
+ .name = "unused", \
+ .field_exists = (_test), \
+ .version_id = (_version), \
+ .size = (_size), \
+ .info = &vmstate_info_unused_buffer, \
+ .flags = VMS_BUFFER, \
+}
+
+/* _field_size should be an int32_t field in the _state struct giving the
+ * size of the bitmap _field in bits.
+ */
+#define VMSTATE_BITMAP(_field, _state, _version, _field_size) { \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .size_offset = vmstate_offset_value(_state, _field_size, int32_t),\
+ .info = &vmstate_info_bitmap, \
+ .flags = VMS_VBUFFER|VMS_POINTER, \
+ .offset = offsetof(_state, _field), \
+}
+
+/* _f   : field name
+   _f_n : name of the field holding the number of elements
+   _n   : number of elements
+   _s   : struct state name
+   _v   : version
+*/
+
+#define VMSTATE_SINGLE(_field, _state, _version, _info, _type) \
+ VMSTATE_SINGLE_TEST(_field, _state, NULL, _version, _info, _type)
+
+#define VMSTATE_STRUCT(_field, _state, _version, _vmsd, _type) \
+ VMSTATE_STRUCT_TEST(_field, _state, NULL, _version, _vmsd, _type)
+
+#define VMSTATE_STRUCT_POINTER(_field, _state, _vmsd, _type) \
+ VMSTATE_STRUCT_POINTER_TEST(_field, _state, NULL, _vmsd, _type)
+
+#define VMSTATE_STRUCT_ARRAY(_field, _state, _num, _version, _vmsd, _type) \
+ VMSTATE_STRUCT_ARRAY_TEST(_field, _state, _num, NULL, _version, \
+ _vmsd, _type)
+
+#define VMSTATE_BOOL_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_bool, bool)
+
+#define VMSTATE_INT8_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_int8, int8_t)
+#define VMSTATE_INT16_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_int16, int16_t)
+#define VMSTATE_INT32_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_int32, int32_t)
+#define VMSTATE_INT64_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_int64, int64_t)
+
+#define VMSTATE_UINT8_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint8, uint8_t)
+#define VMSTATE_UINT16_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint16, uint16_t)
+#define VMSTATE_UINT32_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint32, uint32_t)
+#define VMSTATE_UINT64_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint64, uint64_t)
+
+#define VMSTATE_BOOL(_f, _s) \
+ VMSTATE_BOOL_V(_f, _s, 0)
+
+#define VMSTATE_INT8(_f, _s) \
+ VMSTATE_INT8_V(_f, _s, 0)
+#define VMSTATE_INT16(_f, _s) \
+ VMSTATE_INT16_V(_f, _s, 0)
+#define VMSTATE_INT32(_f, _s) \
+ VMSTATE_INT32_V(_f, _s, 0)
+#define VMSTATE_INT64(_f, _s) \
+ VMSTATE_INT64_V(_f, _s, 0)
+
+#define VMSTATE_UINT8(_f, _s) \
+ VMSTATE_UINT8_V(_f, _s, 0)
+#define VMSTATE_UINT16(_f, _s) \
+ VMSTATE_UINT16_V(_f, _s, 0)
+#define VMSTATE_UINT32(_f, _s) \
+ VMSTATE_UINT32_V(_f, _s, 0)
+#define VMSTATE_UINT64(_f, _s) \
+ VMSTATE_UINT64_V(_f, _s, 0)
+
+#define VMSTATE_UINT8_EQUAL(_f, _s) \
+ VMSTATE_SINGLE(_f, _s, 0, vmstate_info_uint8_equal, uint8_t)
+
+#define VMSTATE_UINT16_EQUAL(_f, _s) \
+ VMSTATE_SINGLE(_f, _s, 0, vmstate_info_uint16_equal, uint16_t)
+
+#define VMSTATE_UINT16_EQUAL_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint16_equal, uint16_t)
+
+#define VMSTATE_INT32_EQUAL(_f, _s) \
+ VMSTATE_SINGLE(_f, _s, 0, vmstate_info_int32_equal, int32_t)
+
+#define VMSTATE_UINT32_EQUAL_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint32_equal, uint32_t)
+
+#define VMSTATE_UINT32_EQUAL(_f, _s) \
+ VMSTATE_UINT32_EQUAL_V(_f, _s, 0)
+
+#define VMSTATE_UINT64_EQUAL_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint64_equal, uint64_t)
+
+#define VMSTATE_UINT64_EQUAL(_f, _s) \
+ VMSTATE_UINT64_EQUAL_V(_f, _s, 0)
+
+#define VMSTATE_INT32_LE(_f, _s) \
+ VMSTATE_SINGLE(_f, _s, 0, vmstate_info_int32_le, int32_t)
+
+#define VMSTATE_UINT8_TEST(_f, _s, _t) \
+ VMSTATE_SINGLE_TEST(_f, _s, _t, 0, vmstate_info_uint8, uint8_t)
+
+#define VMSTATE_UINT16_TEST(_f, _s, _t) \
+ VMSTATE_SINGLE_TEST(_f, _s, _t, 0, vmstate_info_uint16, uint16_t)
+
+#define VMSTATE_UINT32_TEST(_f, _s, _t) \
+ VMSTATE_SINGLE_TEST(_f, _s, _t, 0, vmstate_info_uint32, uint32_t)
+
+
+#define VMSTATE_FLOAT64_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_float64, float64)
+
+#define VMSTATE_FLOAT64(_f, _s) \
+ VMSTATE_FLOAT64_V(_f, _s, 0)
+
+#define VMSTATE_TIMER_TEST(_f, _s, _test) \
+ VMSTATE_POINTER_TEST(_f, _s, _test, vmstate_info_timer, QEMUTimer *)
+
+#define VMSTATE_TIMER_V(_f, _s, _v) \
+ VMSTATE_POINTER(_f, _s, _v, vmstate_info_timer, QEMUTimer *)
+
+#define VMSTATE_TIMER(_f, _s) \
+ VMSTATE_TIMER_V(_f, _s, 0)
+
+#define VMSTATE_TIMER_ARRAY(_f, _s, _n) \
+ VMSTATE_ARRAY_OF_POINTER(_f, _s, _n, 0, vmstate_info_timer, QEMUTimer *)
+
+#define VMSTATE_BOOL_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_bool, bool)
+
+#define VMSTATE_BOOL_ARRAY(_f, _s, _n) \
+ VMSTATE_BOOL_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_UINT16_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_uint16, uint16_t)
+
+#define VMSTATE_UINT16_2DARRAY_V(_f, _s, _n1, _n2, _v) \
+ VMSTATE_2DARRAY(_f, _s, _n1, _n2, _v, vmstate_info_uint16, uint16_t)
+
+#define VMSTATE_UINT16_ARRAY(_f, _s, _n) \
+ VMSTATE_UINT16_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_UINT16_2DARRAY(_f, _s, _n1, _n2) \
+ VMSTATE_UINT16_2DARRAY_V(_f, _s, _n1, _n2, 0)
+
+#define VMSTATE_UINT8_2DARRAY_V(_f, _s, _n1, _n2, _v) \
+ VMSTATE_2DARRAY(_f, _s, _n1, _n2, _v, vmstate_info_uint8, uint8_t)
+
+#define VMSTATE_UINT8_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_uint8, uint8_t)
+
+#define VMSTATE_UINT8_ARRAY(_f, _s, _n) \
+ VMSTATE_UINT8_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_UINT8_2DARRAY(_f, _s, _n1, _n2) \
+ VMSTATE_UINT8_2DARRAY_V(_f, _s, _n1, _n2, 0)
+
+#define VMSTATE_UINT32_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_uint32, uint32_t)
+
+#define VMSTATE_UINT32_ARRAY(_f, _s, _n) \
+ VMSTATE_UINT32_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_UINT64_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_uint64, uint64_t)
+
+#define VMSTATE_UINT64_ARRAY(_f, _s, _n) \
+ VMSTATE_UINT64_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_INT16_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_int16, int16_t)
+
+#define VMSTATE_INT16_ARRAY(_f, _s, _n) \
+ VMSTATE_INT16_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_INT32_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_int32, int32_t)
+
+#define VMSTATE_INT32_ARRAY(_f, _s, _n) \
+ VMSTATE_INT32_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_UINT32_SUB_ARRAY(_f, _s, _start, _num) \
+ VMSTATE_SUB_ARRAY(_f, _s, _start, _num, 0, vmstate_info_uint32, uint32_t)
+
+#define VMSTATE_INT64_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_int64, int64_t)
+
+#define VMSTATE_INT64_ARRAY(_f, _s, _n) \
+ VMSTATE_INT64_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_FLOAT64_ARRAY_V(_f, _s, _n, _v) \
+ VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_float64, float64)
+
+#define VMSTATE_FLOAT64_ARRAY(_f, _s, _n) \
+ VMSTATE_FLOAT64_ARRAY_V(_f, _s, _n, 0)
+
+#define VMSTATE_BUFFER_V(_f, _s, _v) \
+ VMSTATE_STATIC_BUFFER(_f, _s, _v, NULL, 0, sizeof(typeof_field(_s, _f)))
+
+#define VMSTATE_BUFFER(_f, _s) \
+ VMSTATE_BUFFER_V(_f, _s, 0)
+
+#define VMSTATE_PARTIAL_BUFFER(_f, _s, _size) \
+ VMSTATE_STATIC_BUFFER(_f, _s, 0, NULL, 0, _size)
+
+#define VMSTATE_BUFFER_START_MIDDLE(_f, _s, _start) \
+ VMSTATE_STATIC_BUFFER(_f, _s, 0, NULL, _start, sizeof(typeof_field(_s, _f)))
+
+#define VMSTATE_PARTIAL_VBUFFER(_f, _s, _size) \
+ VMSTATE_VBUFFER(_f, _s, 0, NULL, 0, _size)
+
+#define VMSTATE_PARTIAL_VBUFFER_UINT32(_f, _s, _size) \
+ VMSTATE_VBUFFER_UINT32(_f, _s, 0, NULL, 0, _size)
+
+#define VMSTATE_SUB_VBUFFER(_f, _s, _start, _size) \
+ VMSTATE_VBUFFER(_f, _s, 0, NULL, _start, _size)
+
+#define VMSTATE_BUFFER_TEST(_f, _s, _test) \
+ VMSTATE_STATIC_BUFFER(_f, _s, 0, _test, 0, sizeof(typeof_field(_s, _f)))
+
+#define VMSTATE_BUFFER_UNSAFE(_field, _state, _version, _size) \
+ VMSTATE_BUFFER_UNSAFE_INFO(_field, _state, _version, vmstate_info_buffer, _size)
+
+#define VMSTATE_UNUSED_V(_v, _size) \
+ VMSTATE_UNUSED_BUFFER(NULL, _v, _size)
+
+#define VMSTATE_UNUSED(_size) \
+ VMSTATE_UNUSED_V(0, _size)
+
+#define VMSTATE_UNUSED_TEST(_test, _size) \
+ VMSTATE_UNUSED_BUFFER(_test, 0, _size)
+
+#define VMSTATE_END_OF_LIST() \
+ {}
+
+int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
+ void *opaque, int version_id);
+void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
+ void *opaque);
+
+int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
+ const VMStateDescription *vmsd,
+ void *base, int alias_id,
+ int required_for_version);
+
+static inline int vmstate_register(DeviceState *dev, int instance_id,
+ const VMStateDescription *vmsd,
+ void *opaque)
+{
+ return vmstate_register_with_alias_id(dev, instance_id, vmsd,
+ opaque, -1, 0);
+}
+
+void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
+ void *opaque);
+
+struct MemoryRegion;
+void vmstate_register_ram(struct MemoryRegion *memory, DeviceState *dev);
+void vmstate_unregister_ram(struct MemoryRegion *memory, DeviceState *dev);
+void vmstate_register_ram_global(struct MemoryRegion *memory);
+
+#endif
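
The declarative alternative to hand-written save/load handlers is a VMStateDescription built from the macros above. The sketch below shows a minimal field list and registration; MyDevState, vmstate_my_dev and my_dev_realize are illustrative names, reusing the same hypothetical device as the qemu-file.h sketch earlier in this patch.

/* Minimal sketch: each VMSTATE_* macro expands to a VMStateField
 * initializer and VMSTATE_END_OF_LIST() terminates the array.  All
 * identifiers except the macros and vmstate_register() are illustrative. */
typedef struct MyDevState {
    uint32_t ctrl;
    uint16_t irq_level;
    uint8_t  buf[64];
} MyDevState;

static const VMStateDescription vmstate_my_dev = {
    .name = "my-dev",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(ctrl, MyDevState),
        VMSTATE_UINT16(irq_level, MyDevState),
        VMSTATE_UINT8_ARRAY(buf, MyDevState, 64),
        VMSTATE_END_OF_LIST()
    }
};

static void my_dev_realize(DeviceState *dev, MyDevState *s)
{
    /* instance_id -1 lets the core pick a free id for this idstr */
    vmstate_register(dev, -1, &vmstate_my_dev, s);
}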
diff --git a/contrib/qemu/include/monitor/monitor.h b/contrib/qemu/include/monitor/monitor.h
new file mode 100644
index 000000000..1942cc42f
--- /dev/null
+++ b/contrib/qemu/include/monitor/monitor.h
@@ -0,0 +1,104 @@
+#ifndef MONITOR_H
+#define MONITOR_H
+
+#include "qemu-common.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
+#include "block/block.h"
+#include "monitor/readline.h"
+
+extern Monitor *cur_mon;
+extern Monitor *default_mon;
+
+/* flags for monitor_init */
+#define MONITOR_IS_DEFAULT 0x01
+#define MONITOR_USE_READLINE 0x02
+#define MONITOR_USE_CONTROL 0x04
+#define MONITOR_USE_PRETTY 0x08
+
+/* flags for monitor commands */
+#define MONITOR_CMD_ASYNC 0x0001
+
+/* QMP events */
+typedef enum MonitorEvent {
+ QEVENT_SHUTDOWN,
+ QEVENT_RESET,
+ QEVENT_POWERDOWN,
+ QEVENT_STOP,
+ QEVENT_RESUME,
+ QEVENT_VNC_CONNECTED,
+ QEVENT_VNC_INITIALIZED,
+ QEVENT_VNC_DISCONNECTED,
+ QEVENT_BLOCK_IO_ERROR,
+ QEVENT_RTC_CHANGE,
+ QEVENT_WATCHDOG,
+ QEVENT_SPICE_CONNECTED,
+ QEVENT_SPICE_INITIALIZED,
+ QEVENT_SPICE_DISCONNECTED,
+ QEVENT_BLOCK_JOB_COMPLETED,
+ QEVENT_BLOCK_JOB_CANCELLED,
+ QEVENT_BLOCK_JOB_ERROR,
+ QEVENT_BLOCK_JOB_READY,
+ QEVENT_DEVICE_DELETED,
+ QEVENT_DEVICE_TRAY_MOVED,
+ QEVENT_NIC_RX_FILTER_CHANGED,
+ QEVENT_SUSPEND,
+ QEVENT_SUSPEND_DISK,
+ QEVENT_WAKEUP,
+ QEVENT_BALLOON_CHANGE,
+ QEVENT_SPICE_MIGRATE_COMPLETED,
+ QEVENT_GUEST_PANICKED,
+
+ /* Add to 'monitor_event_names' array in monitor.c when
+ * defining new events here */
+
+ QEVENT_MAX,
+} MonitorEvent;
+
+int monitor_cur_is_qmp(void);
+
+void monitor_protocol_event(MonitorEvent event, QObject *data);
+void monitor_init(CharDriverState *chr, int flags);
+
+int monitor_suspend(Monitor *mon);
+void monitor_resume(Monitor *mon);
+
+int monitor_read_bdrv_key_start(Monitor *mon, BlockDriverState *bs,
+ BlockDriverCompletionFunc *completion_cb,
+ void *opaque);
+int monitor_read_block_device_key(Monitor *mon, const char *device,
+ BlockDriverCompletionFunc *completion_cb,
+ void *opaque);
+
+int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp);
+int monitor_handle_fd_param(Monitor *mon, const char *fdname);
+
+void monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)
+ GCC_FMT_ATTR(2, 0);
+void monitor_printf(Monitor *mon, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
+void monitor_print_filename(Monitor *mon, const char *filename);
+void monitor_flush(Monitor *mon);
+int monitor_set_cpu(int cpu_index);
+int monitor_get_cpu_index(void);
+
+typedef void (MonitorCompletion)(void *opaque, QObject *ret_data);
+
+void monitor_set_error(Monitor *mon, QError *qerror);
+void monitor_read_command(Monitor *mon, int show_prompt);
+ReadLineState *monitor_get_rs(Monitor *mon);
+int monitor_read_password(Monitor *mon, ReadLineFunc *readline_func,
+ void *opaque);
+
+int qmp_qom_set(Monitor *mon, const QDict *qdict, QObject **ret);
+
+int qmp_qom_get(Monitor *mon, const QDict *qdict, QObject **ret);
+
+AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, int64_t fdset_id,
+ bool has_opaque, const char *opaque,
+ Error **errp);
+int monitor_fdset_get_fd(int64_t fdset_id, int flags);
+int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd);
+int monitor_fdset_dup_fd_remove(int dup_fd);
+int monitor_fdset_dup_fd_find(int dup_fd);
+
+#endif /* !MONITOR_H */
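
For orientation, a hypothetical HMP-style command handler using the printing API declared above might look like the sketch below. hmp_show_counter and counter are illustrative names; only monitor_printf() and monitor_cur_is_qmp() come from this header.

/* Hedged sketch: print one line to a human monitor. */
static int counter;

static void hmp_show_counter(Monitor *mon, const QDict *qdict)
{
    if (monitor_cur_is_qmp()) {
        /* a QMP monitor expects structured replies, not free-form text */
        return;
    }
    monitor_printf(mon, "counter = %d\n", counter);
}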
diff --git a/contrib/qemu/include/monitor/readline.h b/contrib/qemu/include/monitor/readline.h
new file mode 100644
index 000000000..fc9806ecf
--- /dev/null
+++ b/contrib/qemu/include/monitor/readline.h
@@ -0,0 +1,55 @@
+#ifndef READLINE_H
+#define READLINE_H
+
+#include "qemu-common.h"
+
+#define READLINE_CMD_BUF_SIZE 4095
+#define READLINE_MAX_CMDS 64
+#define READLINE_MAX_COMPLETIONS 256
+
+typedef void ReadLineFunc(Monitor *mon, const char *str, void *opaque);
+typedef void ReadLineCompletionFunc(const char *cmdline);
+
+typedef struct ReadLineState {
+ char cmd_buf[READLINE_CMD_BUF_SIZE + 1];
+ int cmd_buf_index;
+ int cmd_buf_size;
+
+ char last_cmd_buf[READLINE_CMD_BUF_SIZE + 1];
+ int last_cmd_buf_index;
+ int last_cmd_buf_size;
+
+ int esc_state;
+ int esc_param;
+
+ char *history[READLINE_MAX_CMDS];
+ int hist_entry;
+
+ ReadLineCompletionFunc *completion_finder;
+ char *completions[READLINE_MAX_COMPLETIONS];
+ int nb_completions;
+ int completion_index;
+
+ ReadLineFunc *readline_func;
+ void *readline_opaque;
+ int read_password;
+ char prompt[256];
+ Monitor *mon;
+} ReadLineState;
+
+void readline_add_completion(ReadLineState *rs, const char *str);
+void readline_set_completion_index(ReadLineState *rs, int completion_index);
+
+const char *readline_get_history(ReadLineState *rs, unsigned int index);
+
+void readline_handle_byte(ReadLineState *rs, int ch);
+
+void readline_start(ReadLineState *rs, const char *prompt, int read_password,
+ ReadLineFunc *readline_func, void *opaque);
+void readline_restart(ReadLineState *rs);
+void readline_show_prompt(ReadLineState *rs);
+
+ReadLineState *readline_init(Monitor *mon,
+ ReadLineCompletionFunc *completion_finder);
+
+#endif /* !READLINE_H */
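
The intended flow of this interface is roughly: readline_init() builds the state, readline_start() arms a prompt and a line callback, and each byte read from the underlying character device is pushed through readline_handle_byte(). The sketch below uses the hypothetical names my_completion, my_line_cb and my_setup.

/* Hedged sketch of the readline flow; everything except the readline_*
 * functions and the Monitor/ReadLineState types is illustrative. */
static void my_completion(const char *cmdline)
{
    /* would call readline_add_completion() once per matching candidate */
}

static void my_line_cb(Monitor *mon, const char *line, void *opaque)
{
    /* handle one completed command line here */
}

static ReadLineState *my_setup(Monitor *mon)
{
    ReadLineState *rs = readline_init(mon, my_completion);

    readline_start(rs, "(qemu) ", 0, my_line_cb, NULL);
    readline_show_prompt(rs);
    return rs;
}

/* Later, for each byte received from the character device:
 *     readline_handle_byte(rs, ch);
 */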
diff --git a/contrib/qemu/include/qapi/error.h b/contrib/qemu/include/qapi/error.h
new file mode 100644
index 000000000..ffd1cea47
--- /dev/null
+++ b/contrib/qemu/include/qapi/error.h
@@ -0,0 +1,85 @@
+/*
+ * QEMU Error Objects
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2. See
+ * the COPYING.LIB file in the top-level directory.
+ */
+#ifndef ERROR_H
+#define ERROR_H
+
+#include "qemu/compiler.h"
+#include "qapi-types.h"
+#include <stdbool.h>
+
+/**
+ * A class representing internal errors within QEMU. An error has an ErrorClass
+ * code and a human message.
+ */
+typedef struct Error Error;
+
+/**
+ * Set an indirect pointer to an error given an ErrorClass value and a
+ * printf-style human message. This function is not meant to be used outside
+ * of QEMU.
+ */
+void error_set(Error **err, ErrorClass err_class, const char *fmt, ...) GCC_FMT_ATTR(3, 4);
+
+/**
+ * Set an indirect pointer to an error given an ErrorClass value and a
+ * printf-style human message, followed by a strerror() string if
+ * @os_error is not zero.
+ */
+void error_set_errno(Error **err, int os_error, ErrorClass err_class, const char *fmt, ...) GCC_FMT_ATTR(4, 5);
+
+/**
+ * Same as error_set(), but sets a generic error
+ */
+#define error_setg(err, fmt, ...) \
+ error_set(err, ERROR_CLASS_GENERIC_ERROR, fmt, ## __VA_ARGS__)
+#define error_setg_errno(err, os_error, fmt, ...) \
+ error_set_errno(err, os_error, ERROR_CLASS_GENERIC_ERROR, fmt, ## __VA_ARGS__)
+
+/**
+ * Helper for open() errors
+ */
+void error_setg_file_open(Error **errp, int os_errno, const char *filename);
+
+/**
+ * Returns true if an indirect pointer to an error is pointing to a valid
+ * error object.
+ */
+bool error_is_set(Error **err);
+
+/*
+ * Get the error class of an error object.
+ */
+ErrorClass error_get_class(const Error *err);
+
+/**
+ * Returns an exact copy of the error passed as an argument.
+ */
+Error *error_copy(const Error *err);
+
+/**
+ * Get a human readable representation of an error object.
+ */
+const char *error_get_pretty(Error *err);
+
+/**
+ * Propagate an error to an indirect pointer to an error. This function will
+ * always transfer ownership of the error reference and handles the case where
+ * dst_err is NULL correctly. Errors after the first are discarded.
+ */
+void error_propagate(Error **dst_err, Error *local_err);
+
+/**
+ * Free an error object.
+ */
+void error_free(Error *err);
+
+#endif
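
As a rough illustration of the calling convention these declarations imply: a callee fills an Error ** on failure, and the caller either inspects or propagates it. my_open, my_start and the file path below are illustrative; only the error_* functions come from this header, and the sketch assumes <stdio.h> and <errno.h> are available.

/* Hedged sketch of the Error ** convention: set locally, propagate up. */
static int my_open(const char *filename, Error **errp)
{
    FILE *fp = fopen(filename, "r");

    if (!fp) {
        error_setg_errno(errp, errno, "could not open '%s'", filename);
        return -1;
    }
    fclose(fp);
    return 0;
}

static void my_start(Error **errp)
{
    Error *local_err = NULL;

    if (my_open("/some/config", &local_err) < 0) {
        /* hands ownership of local_err to the caller's errp (may be NULL) */
        error_propagate(errp, local_err);
        return;
    }
}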
diff --git a/contrib/qemu/include/qapi/qmp/json-lexer.h b/contrib/qemu/include/qapi/qmp/json-lexer.h
new file mode 100644
index 000000000..cdff0460a
--- /dev/null
+++ b/contrib/qemu/include/qapi/qmp/json-lexer.h
@@ -0,0 +1,51 @@
+/*
+ * JSON lexer
+ *
+ * Copyright IBM, Corp. 2009
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_JSON_LEXER_H
+#define QEMU_JSON_LEXER_H
+
+#include "qapi/qmp/qstring.h"
+#include "qapi/qmp/qlist.h"
+
+typedef enum json_token_type {
+ JSON_OPERATOR = 100,
+ JSON_INTEGER,
+ JSON_FLOAT,
+ JSON_KEYWORD,
+ JSON_STRING,
+ JSON_ESCAPE,
+ JSON_SKIP,
+ JSON_ERROR,
+} JSONTokenType;
+
+typedef struct JSONLexer JSONLexer;
+
+typedef void (JSONLexerEmitter)(JSONLexer *, QString *, JSONTokenType, int x, int y);
+
+struct JSONLexer
+{
+ JSONLexerEmitter *emit;
+ int state;
+ QString *token;
+ int x, y;
+};
+
+void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func);
+
+int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size);
+
+int json_lexer_flush(JSONLexer *lexer);
+
+void json_lexer_destroy(JSONLexer *lexer);
+
+#endif
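
Driving the lexer looks roughly like the sketch below: an emitter callback receives each token, json_lexer_feed() consumes raw input, and json_lexer_flush() pushes out any token still being assembled. my_emit and lex_example are illustrative names, and <string.h> is assumed for strlen().

/* Hedged sketch; only the json_lexer_* functions and the JSONLexer,
 * QString and JSONTokenType types come from this header. */
static void my_emit(JSONLexer *lexer, QString *token, JSONTokenType type,
                    int x, int y)
{
    /* e.g. hand (type, token) to a streaming JSON parser */
}

static void lex_example(const char *text)
{
    JSONLexer lexer;

    json_lexer_init(&lexer, my_emit);
    json_lexer_feed(&lexer, text, strlen(text));
    json_lexer_flush(&lexer);      /* emit any partially built token */
    json_lexer_destroy(&lexer);
}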
diff --git a/contrib/qemu/include/qapi/qmp/json-parser.h b/contrib/qemu/include/qapi/qmp/json-parser.h
new file mode 100644
index 000000000..44d88f346
--- /dev/null
+++