diff options
Diffstat (limited to 'doc')
121 files changed, 8984 insertions, 2617 deletions
diff --git a/doc/Makefile.am b/doc/Makefile.am index c5fd2b81a..1103b607d 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,13 +1,6 @@ -EXTRA_DIST = glusterfs.vol.sample glusterfsd.vol.sample glusterfs.8 mount.glusterfs.8\ - porting_guide.txt authentication.txt coding-standard.pdf get_put_api_using_xattr.txt \ - translator-options.txt mac-related-xattrs.txt replicate.pdf glusterd.vol gluster.8 \ - glusterd.8 glusterfsd.8 - -SUBDIRS = examples hacker-guide - -voldir = $(sysconfdir)/glusterfs -vol_DATA = glusterfs.vol.sample glusterfsd.vol.sample glusterd.vol +EXTRA_DIST = glusterfs.8 mount.glusterfs.8 gluster.8 \ + glusterd.8 glusterfsd.8 man8_MANS = glusterfs.8 mount.glusterfs.8 gluster.8 glusterd.8 glusterfsd.8 -CLEANFILES = +CLEANFILES = diff --git a/doc/admin-guide/en-US/images/640px-GlusterFS_Architecture.png b/doc/admin-guide/en-US/images/640px-GlusterFS_Architecture.png Binary files differnew file mode 100644 index 000000000..95f89ec82 --- /dev/null +++ b/doc/admin-guide/en-US/images/640px-GlusterFS_Architecture.png diff --git a/doc/admin-guide/en-US/images/Distributed_Replicated_Volume.png b/doc/admin-guide/en-US/images/Distributed_Replicated_Volume.png Binary files differnew file mode 100644 index 000000000..dfc0a2c56 --- /dev/null +++ b/doc/admin-guide/en-US/images/Distributed_Replicated_Volume.png diff --git a/doc/admin-guide/en-US/images/Distributed_Striped_Replicated_Volume.png b/doc/admin-guide/en-US/images/Distributed_Striped_Replicated_Volume.png Binary files differnew file mode 100644 index 000000000..d286fa99e --- /dev/null +++ b/doc/admin-guide/en-US/images/Distributed_Striped_Replicated_Volume.png diff --git a/doc/admin-guide/en-US/images/Distributed_Striped_Volume.png b/doc/admin-guide/en-US/images/Distributed_Striped_Volume.png Binary files differnew file mode 100644 index 000000000..752fa982f --- /dev/null +++ b/doc/admin-guide/en-US/images/Distributed_Striped_Volume.png diff --git a/doc/admin-guide/en-US/images/Distributed_Volume.png 
b/doc/admin-guide/en-US/images/Distributed_Volume.png Binary files differnew file mode 100644 index 000000000..4386ca935 --- /dev/null +++ b/doc/admin-guide/en-US/images/Distributed_Volume.png diff --git a/doc/admin-guide/en-US/images/Geo-Rep03_Internet.png b/doc/admin-guide/en-US/images/Geo-Rep03_Internet.png Binary files differnew file mode 100644 index 000000000..3cd0eaded --- /dev/null +++ b/doc/admin-guide/en-US/images/Geo-Rep03_Internet.png diff --git a/doc/admin-guide/en-US/images/Geo-Rep04_Cascading.png b/doc/admin-guide/en-US/images/Geo-Rep04_Cascading.png Binary files differnew file mode 100644 index 000000000..54bf9f05c --- /dev/null +++ b/doc/admin-guide/en-US/images/Geo-Rep04_Cascading.png diff --git a/doc/admin-guide/en-US/images/Geo-Rep_LAN.png b/doc/admin-guide/en-US/images/Geo-Rep_LAN.png Binary files differnew file mode 100644 index 000000000..a74f6dbb5 --- /dev/null +++ b/doc/admin-guide/en-US/images/Geo-Rep_LAN.png diff --git a/doc/admin-guide/en-US/images/Geo-Rep_WAN.png b/doc/admin-guide/en-US/images/Geo-Rep_WAN.png Binary files differnew file mode 100644 index 000000000..d72d72768 --- /dev/null +++ b/doc/admin-guide/en-US/images/Geo-Rep_WAN.png diff --git a/doc/admin-guide/en-US/images/GlusterFS_Architecture.png b/doc/admin-guide/en-US/images/GlusterFS_Architecture.png Binary files differnew file mode 100644 index 000000000..b506db1f4 --- /dev/null +++ b/doc/admin-guide/en-US/images/GlusterFS_Architecture.png diff --git a/doc/admin-guide/en-US/images/Hadoop_Architecture.png b/doc/admin-guide/en-US/images/Hadoop_Architecture.png Binary files differnew file mode 100644 index 000000000..8725bd330 --- /dev/null +++ b/doc/admin-guide/en-US/images/Hadoop_Architecture.png diff --git a/doc/admin-guide/en-US/images/Replicated_Volume.png b/doc/admin-guide/en-US/images/Replicated_Volume.png Binary files differnew file mode 100644 index 000000000..135a63f34 --- /dev/null +++ b/doc/admin-guide/en-US/images/Replicated_Volume.png diff --git 
a/doc/admin-guide/en-US/images/Striped_Replicated_Volume.png b/doc/admin-guide/en-US/images/Striped_Replicated_Volume.png Binary files differnew file mode 100644 index 000000000..ee88af731 --- /dev/null +++ b/doc/admin-guide/en-US/images/Striped_Replicated_Volume.png diff --git a/doc/admin-guide/en-US/images/Striped_Volume.png b/doc/admin-guide/en-US/images/Striped_Volume.png Binary files differnew file mode 100644 index 000000000..63a84b242 --- /dev/null +++ b/doc/admin-guide/en-US/images/Striped_Volume.png diff --git a/doc/admin-guide/en-US/images/UFO_Architecture.png b/doc/admin-guide/en-US/images/UFO_Architecture.png Binary files differnew file mode 100644 index 000000000..be85d7b28 --- /dev/null +++ b/doc/admin-guide/en-US/images/UFO_Architecture.png diff --git a/doc/admin-guide/en-US/images/VSA_Architecture.png b/doc/admin-guide/en-US/images/VSA_Architecture.png Binary files differnew file mode 100644 index 000000000..c3ab80cf3 --- /dev/null +++ b/doc/admin-guide/en-US/images/VSA_Architecture.png diff --git a/doc/admin-guide/en-US/images/icon.svg b/doc/admin-guide/en-US/images/icon.svg new file mode 100644 index 000000000..b2f16d0f6 --- /dev/null +++ b/doc/admin-guide/en-US/images/icon.svg @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<svg xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" width="32" height="32" id="svg3017"> + <defs id="defs3019"> + <linearGradient id="linearGradient2381"> + <stop id="stop2383" style="stop-color:#ffffff;stop-opacity:1" offset="0"/> + <stop id="stop2385" style="stop-color:#ffffff;stop-opacity:0" offset="1"/> + </linearGradient> + <linearGradient x1="296.4996" y1="188.81061" x2="317.32471" y2="209.69398" id="linearGradient2371" xlink:href="#linearGradient2381" gradientUnits="userSpaceOnUse" gradientTransform="matrix(0.90776,0,0,0.90776,24.35648,49.24131)"/> + </defs> + <g 
transform="matrix(0.437808,-0.437808,0.437808,0.437808,-220.8237,43.55311)" id="g5089"> + <path d="m 8.4382985,-6.28125 c -0.6073916,0 -4.3132985,5.94886271 -4.3132985,8.25 l 0,26.71875 c 0,0.846384 0.5818159,1.125 1.15625,1.125 l 25.5625,0 c 0.632342,0 1.125001,-0.492658 1.125,-1.125 l 0,-5.21875 0.28125,0 c 0.49684,0 0.906249,-0.409411 0.90625,-0.90625 l 0,-27.9375 c 0,-0.4968398 -0.40941,-0.90625 -0.90625,-0.90625 l -23.8117015,0 z" transform="translate(282.8327,227.1903)" id="path5091" style="fill:#5c5c4f;stroke:#000000;stroke-width:3.23021388;stroke-miterlimit:4;stroke-dasharray:none"/> + <rect width="27.85074" height="29.369793" rx="1.1414107" ry="1.1414107" x="286.96509" y="227.63805" id="rect5093" style="fill:#032c87"/> + <path d="m 288.43262,225.43675 25.2418,0 0,29.3698 -26.37615,0.0241 1.13435,-29.39394 z" id="rect5095" style="fill:#ffffff"/> + <path d="m 302.44536,251.73726 c 1.38691,7.85917 -0.69311,11.28365 -0.69311,11.28365 2.24384,-1.60762 3.96426,-3.47694 4.90522,-5.736 0.96708,2.19264 1.83294,4.42866 4.27443,5.98941 0,0 -1.59504,-7.2004 -1.71143,-11.53706 l -6.77511,0 z" id="path5097" style="fill:#a70000;fill-opacity:1;stroke-width:2"/> + <rect width="25.241802" height="29.736675" rx="0.89682275" ry="0.89682275" x="290.73544" y="220.92249" id="rect5099" style="fill:#809cc9"/> + <path d="m 576.47347,725.93939 6.37084,0.41502 0.4069,29.51809 c -1.89202,-1.31785 -6.85427,-3.7608 -8.26232,-1.68101 l 0,-26.76752 c 0,-0.82246 0.66212,-1.48458 1.48458,-1.48458 z" transform="matrix(0.499065,-0.866565,0,1,0,0)" id="rect5101" style="fill:#4573b3;fill-opacity:1"/> + <path d="m 293.2599,221.89363 20.73918,0 c 0.45101,0 0.8141,0.3631 0.8141,0.81411 0.21547,6.32836 -19.36824,21.7635 -22.36739,17.59717 l 0,-17.59717 c 0,-0.45101 0.3631,-0.81411 0.81411,-0.81411 z" id="path5103" style="opacity:0.65536726;fill:url(#linearGradient2371);fill-opacity:1"/> + </g> +</svg> diff --git a/doc/admin-guide/en-US/markdown/Administration_Guide.md 
b/doc/admin-guide/en-US/markdown/Administration_Guide.md new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/Administration_Guide.md @@ -0,0 +1 @@ + diff --git a/doc/admin-guide/en-US/markdown/Author_Group.md b/doc/admin-guide/en-US/markdown/Author_Group.md new file mode 100644 index 000000000..ef2a5e677 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/Author_Group.md @@ -0,0 +1,5 @@ +Divya +Muntimadugu +Red Hat +Engineering Content Services +divya@redhat.com diff --git a/doc/admin-guide/en-US/markdown/Book_Info.md b/doc/admin-guide/en-US/markdown/Book_Info.md new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/Book_Info.md @@ -0,0 +1 @@ + diff --git a/doc/admin-guide/en-US/markdown/Chapter.md b/doc/admin-guide/en-US/markdown/Chapter.md new file mode 100644 index 000000000..8420259c4 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/Chapter.md @@ -0,0 +1,18 @@ +Test Chapter +============ + +This is a test paragraph + +Test Section 1 +============== + +This is a test paragraph in a section + +Test Section 2 +============== + +This is a test paragraph in Section 2 + +1. listitem text + + diff --git a/doc/admin-guide/en-US/markdown/Preface.md b/doc/admin-guide/en-US/markdown/Preface.md new file mode 100644 index 000000000..f7e934ae8 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/Preface.md @@ -0,0 +1,22 @@ +Preface +======= + +This guide describes how to configure, operate, and manage Gluster File +System (GlusterFS). + +Audience +======== + +This guide is intended for Systems Administrators interested in +configuring and managing GlusterFS. + +This guide assumes that you are familiar with the Linux operating +system, concepts of File System, GlusterFS concepts, and GlusterFS +Installation + +License +======= + +The License information is available at [][]. 
+ + []: http://www.redhat.com/licenses/rhel_rha_eula.html diff --git a/doc/admin-guide/en-US/markdown/Revision_History.md b/doc/admin-guide/en-US/markdown/Revision_History.md new file mode 100644 index 000000000..2084309d1 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/Revision_History.md @@ -0,0 +1,4 @@ +Revision History +================ + +1-0 Thu Apr 5 2012 Divya Muntimadugu <divya@redhat.com> Draft diff --git a/doc/admin-guide/en-US/markdown/admin_ACLs.md b/doc/admin-guide/en-US/markdown/admin_ACLs.md new file mode 100644 index 000000000..308e069ca --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_ACLs.md @@ -0,0 +1,197 @@ +POSIX Access Control Lists +========================== + +POSIX Access Control Lists (ACLs) allows you to assign different +permissions for different users or groups even though they do not +correspond to the original owner or the owning group. + +For example: User john creates a file but does not want to allow anyone +to do anything with this file, except another user, antony (even though +there are other users that belong to the group john). + +This means, in addition to the file owner, the file group, and others, +additional users and groups can be granted or denied access by using +POSIX ACLs. + +Activating POSIX ACLs Support +============================= + +To use POSIX ACLs for a file or directory, the partition of the file or +directory must be mounted with POSIX ACLs support. 
+ +Activating POSIX ACLs Support on Sever +-------------------------------------- + +To mount the backend export directories for POSIX ACLs support, use the +following command: + +`# mount -o acl ` + +For example: + +`# mount -o acl /dev/sda1 /export1 ` + +Alternatively, if the partition is listed in the /etc/fstab file, add +the following entry for the partition to include the POSIX ACLs option: + +`LABEL=/work /export1 ext3 rw, acl 14 ` + +Activating POSIX ACLs Support on Client +--------------------------------------- + +To mount the glusterfs volumes for POSIX ACLs support, use the following +command: + +`# mount –t glusterfs -o acl ` + +For example: + +`# mount -t glusterfs -o acl 198.192.198.234:glustervolume /mnt/gluster` + +Setting POSIX ACLs +================== + +You can set two types of POSIX ACLs, that is, access ACLs and default +ACLs. You can use access ACLs to grant permission for a specific file or +directory. You can use default ACLs only on a directory but if a file +inside that directory does not have an ACLs, it inherits the permissions +of the default ACLs of the directory. + +You can set ACLs for per user, per group, for users not in the user +group for the file, and via the effective right mask. + +Setting Access ACLs +------------------- + +You can apply access ACLs to grant permission for both files and +directories. + +**To set or modify Access ACLs** + +You can set or modify access ACLs use the following command: + +`# setfacl –m file ` + +The ACL entry types are the POSIX ACLs representations of owner, group, +and other. + +Permissions must be a combination of the characters `r` (read), `w` +(write), and `x` (execute). You must specify the ACL entry in the +following format and can specify multiple entry types separated by +commas. 
+ + ACL Entry Description + ---------------------- -------------------------------------------------------------------------------------------------------------------------------------------------- + u:uid:\<permission\> Sets the access ACLs for a user. You can specify user name or UID + g:gid:\<permission\> Sets the access ACLs for a group. You can specify group name or GID. + m:\<permission\> Sets the effective rights mask. The mask is the combination of all access permissions of the owning group and all of the user and group entries. + o:\<permission\> Sets the access ACLs for users other than the ones in the group for the file. + +If a file or directory already has an POSIX ACLs, and the setfacl +command is used, the additional permissions are added to the existing +POSIX ACLs or the existing rule is modified. + +For example, to give read and write permissions to user antony: + +`# setfacl -m u:antony:rw /mnt/gluster/data/testfile ` + +Setting Default ACLs +-------------------- + +You can apply default ACLs only to directories. They determine the +permissions of a file system objects that inherits from its parent +directory when it is created. + +To set default ACLs + +You can set default ACLs for files and directories using the following +command: + +`# setfacl –m –-set ` + +For example, to set the default ACLs for the /data directory to read for +users not in the user group: + +`# setfacl –m --set o::r /mnt/gluster/data ` + +> **Note** +> +> An access ACLs set for an individual file can override the default +> ACLs permissions. + +**Effects of a Default ACLs** + +The following are the ways in which the permissions of a directory's +default ACLs are passed to the files and subdirectories in it: + +- A subdirectory inherits the default ACLs of the parent directory + both as its default ACLs and as an access ACLs. + +- A file inherits the default ACLs as its access ACLs. 
+ +Retrieving POSIX ACLs +===================== + +You can view the existing POSIX ACLs for a file or directory. + +**To view existing POSIX ACLs** + +- View the existing access ACLs of a file using the following command: + + `# getfacl ` + + For example, to view the existing POSIX ACLs for sample.jpg + + # getfacl /mnt/gluster/data/test/sample.jpg + # owner: antony + # group: antony + user::rw- + group::rw- + other::r-- + +- View the default ACLs of a directory using the following command: + + `# getfacl ` + + For example, to view the existing ACLs for /data/doc + + # getfacl /mnt/gluster/data/doc + # owner: antony + # group: antony + user::rw- + user:john:r-- + group::r-- + mask::r-- + other::r-- + default:user::rwx + default:user:antony:rwx + default:group::r-x + default:mask::rwx + default:other::r-x + +Removing POSIX ACLs +=================== + +To remove all the permissions for a user, groups, or others, use the +following command: + +`# setfacl -x ` + +For example, to remove all permissions from the user antony: + +`# setfacl -x u:antony /mnt/gluster/data/test-file` + +Samba and ACLs +============== + +If you are using Samba to access GlusterFS FUSE mount, then POSIX ACLs +are enabled by default. Samba has been compiled with the +`--with-acl-support` option, so no special flags are required when +accessing or mounting a Samba share. + +NFS and ACLs +============ + +Currently we do not support ACLs configuration through NFS, i.e. setfacl +and getfacl commands do not work. However, ACLs permissions set using +Gluster Native Client is applicable on NFS mounts. 
diff --git a/doc/admin-guide/en-US/markdown/admin_Hadoop.md b/doc/admin-guide/en-US/markdown/admin_Hadoop.md new file mode 100644 index 000000000..2894fa713 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_Hadoop.md @@ -0,0 +1,170 @@ +Managing Hadoop Compatible Storage +================================== + +GlusterFS provides compatibility for Apache Hadoop and it uses the +standard file system APIs available in Hadoop to provide a new storage +option for Hadoop deployments. Existing MapReduce based applications can +use GlusterFS seamlessly. This new functionality opens up data within +Hadoop deployments to any file-based or object-based application. + +Architecture Overview +===================== + +The following diagram illustrates Hadoop integration with GlusterFS: + +Advantages +========== + +The following are the advantages of Hadoop Compatible Storage with +GlusterFS: + +- Provides simultaneous file-based and object-based access within + Hadoop. + +- Eliminates the centralized metadata server. + +- Provides compatibility with MapReduce applications and rewrite is + not required. + +- Provides a fault tolerant file system. + +Preparing to Install Hadoop Compatible Storage +============================================== + +This section provides information on pre-requisites and list of +dependencies that will be installed during installation of Hadoop +compatible storage. + +Pre-requisites +-------------- + +The following are the pre-requisites to install Hadoop Compatible +Storage : + +- Hadoop 0.20.2 is installed, configured, and is running on all the + machines in the cluster. 
+ +- Java Runtime Environment + +- Maven (mandatory only if you are building the plugin from the + source) + +- JDK (mandatory only if you are building the plugin from the source) + +- getfattr - command line utility + +Installing, and Configuring Hadoop Compatible Storage +===================================================== + +This section describes how to install and configure Hadoop Compatible +Storage in your storage environment and verify that it is functioning +correctly. + +1. Download `glusterfs-hadoop-0.20.2-0.1.x86_64.rpm` file to each + server on your cluster. You can download the file from [][]. + +2. To install Hadoop Compatible Storage on all servers in your cluster, + run the following command: + + `# rpm –ivh --nodeps glusterfs-hadoop-0.20.2-0.1.x86_64.rpm` + + The following files will be extracted: + + - /usr/local/lib/glusterfs-Hadoop-version-gluster\_plugin\_version.jar + + - /usr/local/lib/conf/core-site.xml + +3. (Optional) To install Hadoop Compatible Storage in a different + location, run the following command: + + `# rpm –ivh --nodeps –prefix /usr/local/glusterfs/hadoop glusterfs-hadoop- 0.20.2-0.1.x86_64.rpm` + +4. Edit the `conf/core-site.xml` file. 
The following is the sample + `conf/core-site.xml` file: + + <configuration> + <property> + <name>fs.glusterfs.impl</name> + <value>org.apache.hadoop.fs.glusterfs.Gluster FileSystem</value> + </property> + + <property> + <name>fs.default.name</name> + <value>glusterfs://fedora1:9000</value> + </property> + + <property> + <name>fs.glusterfs.volname</name> + <value>hadoopvol</value> + </property> + + <property> + <name>fs.glusterfs.mount</name> + <value>/mnt/glusterfs</value> + </property> + + <property> + <name>fs.glusterfs.server</name> + <value>fedora2</value> + </property> + + <property> + <name>quick.slave.io</name> + <value>Off</value> + </property> + </configuration> + + The following are the configurable fields: + + ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Property Name Default Value Description + ---------------------- -------------------------- --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + fs.default.name glusterfs://fedora1:9000 Any hostname in the cluster as the server and any port number. + + fs.glusterfs.volname hadoopvol GlusterFS volume to mount. + + fs.glusterfs.mount /mnt/glusterfs The directory used to fuse mount the volume. + + fs.glusterfs.server fedora2 Any hostname or IP address on the cluster except the client/master. + + quick.slave.io Off Performance tunable option. If this option is set to On, the plugin will try to perform I/O directly from the disk file system (like ext3 or ext4) the file resides on. Hence read performance will improve and job would run faster. 
+ > **Note** + > + > This option is not tested widely + ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +5. Create a soft link in Hadoop’s library and configuration directory + for the downloaded files (in Step 3) using the following commands: + + `# ln -s >` + + For example, + + `# ln –s /usr/local/lib/glusterfs-0.20.2-0.1.jar /lib/glusterfs-0.20.2-0.1.jar` + + `# ln –s /usr/local/lib/conf/core-site.xml /conf/core-site.xml ` + +6. (Optional) You can run the following command on Hadoop master to + build the plugin and deploy it along with core-site.xml file, + instead of repeating the above steps: + + `# build-deploy-jar.py -d -c ` + +Starting and Stopping the Hadoop MapReduce Daemon +================================================= + +To start and stop MapReduce daemon + +- To start MapReduce daemon manually, enter the following command: + + `# /bin/start-mapred.sh` + +- To stop MapReduce daemon manually, enter the following command: + + `# /bin/stop-mapred.sh ` + +> **Note** +> +> You must start Hadoop MapReduce daemon on all servers. + + []: http://download.gluster.com/pub/gluster/glusterfs/qa-releases/3.3-beta-2/glusterfs-hadoop-0.20.2-0.1.x86_64.rpm diff --git a/doc/admin-guide/en-US/markdown/admin_UFO.md b/doc/admin-guide/en-US/markdown/admin_UFO.md new file mode 100644 index 000000000..3311eff01 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_UFO.md @@ -0,0 +1,1219 @@ +Managing Unified File and Object Storage +======================================== + +Unified File and Object Storage (UFO) unifies NAS and object storage +technology. It provides a system for data storage that enables users to +access the same data, both as an object and as a file, thus simplifying +management and controlling storage costs. 
+ +Unified File and Object Storage is built upon Openstack's Object Storage +Swift. Open Stack Object Storage allows users to store and retrieve +files and content through a simple Web Service (REST: Representational +State Transfer) interface as objects and GlusterFS, allows users to +store and retrieve files using Native Fuse and NFS mounts. It uses +GlusterFS as a backend file system for Open Stack Swift. It also +leverages on Open Stack Swift's web interface for storing and retrieving +files over the web combined with GlusterFS features like scalability and +high availability, replication, elastic volume management for data +management at disk level. + +Unified File and Object Storage technology enables enterprises to adopt +and deploy cloud storage solutions. It allows users to access and modify +data as objects from a REST interface along with the ability to access +and modify files from NAS interfaces including NFS and CIFS. In addition +to decreasing cost and making it faster and easier to access object +data, it also delivers massive scalability, high availability and +replication of object storage. Infrastructure as a Service (IaaS) +providers can utilize GlusterFS Unified File and Object Storage +technology to enable their own cloud storage service. Enterprises can +use this technology to accelerate the process of preparing file-based +applications for the cloud and simplify new application development for +cloud computing environments. + +OpenStack Object Storage is scalable object storage system and it is not +a traditional file system. You will not be able to mount this system +like traditional SAN or NAS volumes and perform POSIX compliant +operations. + +Components of Object Storage +============================ + +The major components of Object Storage are: + +**Proxy Server** + +All REST requests to the UFO are routed through the Proxy Server. 
+ +**Objects and Containers** + +An object is the basic storage entity and any optional metadata that +represents the data you store. When you upload data, the data is stored +as-is (with no compression or encryption). + +A container is a storage compartment for your data and provides a way +for you to organize your data. Containers can be visualized as +directories in a Linux system. Data must be stored in a container and +hence objects are created within a container. + +It implements objects as files and directories under the container. The +object name is a '/' separated path and UFO maps it to directories until +the last name in the path, which is marked as a file. With this +approach, objects can be accessed as files and directories from native +GlusterFS (FUSE) or NFS mounts by providing the '/' separated path. + +**Accounts and Account Servers** + +The OpenStack Object Storage system is designed to be used by many +different storage consumers. Each user is associated with one or more +accounts and must identify themselves using an authentication system. +While authenticating, users must provide the name of the account for +which the authentication is requested. + +UFO implements accounts as GlusterFS volumes. So, when a user is granted +read/write permission on an account, it means that that user has access +to all the data available on that GlusterFS volume. + +**Authentication and Access Permissions** + +You must authenticate against an authentication service to receive +OpenStack Object Storage connection parameters and an authentication +token. The token must be passed in for all subsequent container or +object operations. One authentication service that you can use as a +middleware example is called `tempauth`. + +By default, each user has their own storage account and has full access +to that account. Users must authenticate with their credentials as +described above, but once authenticated they can manage containers and +objects within that account. 
If a user wants to access the content from +another account, they must have API access key or a session token +provided by their authentication system. + +Advantages of using GlusterFS Unified File and Object Storage +============================================================= + +The following are the advantages of using GlusterFS UFO: + +- No limit on upload and download files sizes as compared to Open + Stack Swift which limits the object size to 5GB. + +- A unified view of data across NAS and Object Storage technologies. + +- Using GlusterFS's UFO has other advantages like the following: + + - High availability + + - Scalability + + - Replication + + - Elastic Volume management + +Preparing to Deploy Unified File and Object Storage +=================================================== + +This section provides information on pre-requisites and list of +dependencies that will be installed during the installation of Unified +File and Object Storage. + +Pre-requisites +-------------- + +GlusterFS's Unified File and Object Storage needs `user_xattr` support +from the underlying disk file system. Use the following command to +enable `user_xattr` for GlusterFS bricks backend: + +`# mount –o remount,user_xattr ` + +For example, + +`# mount –o remount,user_xattr /dev/hda1 ` + +Dependencies +------------ + +The following packages are installed on GlusterFS when you install +Unified File and Object Storage: + +- curl + +- memcached + +- openssl + +- xfsprogs + +- python2.6 + +- pyxattr + +- python-configobj + +- python-setuptools + +- python-simplejson + +- python-webob + +- python-eventlet + +- python-greenlet + +- python-pastedeploy + +- python-netifaces + +Installing and Configuring Unified File and Object Storage +========================================================== + +This section provides instructions on how to install and configure +Unified File and Object Storage in your storage environment. 
+ +Installing Unified File and Object Storage +------------------------------------------ + +To install Unified File and Object Storage: + +1. Download `rhel_install.sh` install script from [][] . + +2. Run `rhel_install.sh` script using the following command: + + `# sh rhel_install.sh` + +3. Download `swift-1.4.5-1.noarch.rpm` and + `swift-plugin-1.0.-1.el6.noarch.rpm` files from [][]. + +4. Install `swift-1.4.5-1.noarch.rpm` and + `swift-plugin-1.0.-1.el6.noarch.rpm` using the following commands: + + `# rpm -ivh swift-1.4.5-1.noarch.rpm` + + `# rpm -ivh swift-plugin-1.0.-1.el6.noarch.rpm` + + > **Note** + > + > You must repeat the above steps on all the machines on which you + > want to install Unified File and Object Storage. If you install + > the Unified File and Object Storage on multiple servers, you can + > use a load balancer like pound, nginx, and so on to distribute the + > request across the machines. + +Adding Users +------------ + +The authentication system allows the administrator to grant different +levels of access to different users based on the requirement. The +following are the types of user permissions: + +- admin user + +- normal user + +Admin user has read and write permissions on the account. By default, a +normal user has no read or write permissions. A normal user can only +authenticate itself to get a Auth-Token. Read or write permission are +provided through ACLs by the admin users. + +Add a new user by adding the following entry in +`/etc/swift/proxy-server.conf` file: + +`user_<account-name>_<user-name> = <password> [.admin]` + +For example, + +`user_test_tester = testing .admin` + +> **Note** +> +> During installation, the installation script adds few sample users to +> the `proxy-server.conf` file. It is highly recommended that you remove +> all the default sample user entries from the configuration file. + +For more information on setting ACLs, see ?. 
+ +Configuring Proxy Server +------------------------ + +The Proxy Server is responsible for connecting to the rest of the +OpenStack Object Storage architecture. For each request, it looks up the +location of the account, container, or object in the ring and route the +request accordingly. The public API is also exposed through the proxy +server. When objects are streamed to or from an object server, they are +streamed directly through the proxy server to or from the user – the +proxy server does not spool them. + +The configurable options pertaining to proxy server are stored in +`/etc/swift/proxy-server.conf`. The following is the sample +`proxy-server.conf` file: + + [app:proxy-server] + use = egg:swift#proxy + allow_account_management=true + account_autocreate=true + + [filter:tempauth] + use = egg:swift#tempauth user_admin_admin=admin.admin.reseller_admin + user_test_tester=testing.admin + user_test2_tester2=testing2.admin + user_test_tester3=testing3 + + [filter:healthcheck] + use = egg:swift#healthcheck + + [filter:cache] + use = egg:swift#memcache + +By default, GlusterFS's Unified File and Object Storage is configured to +support HTTP protocol and uses temporary authentication to authenticate +the HTTP requests. + +Configuring Authentication System +--------------------------------- + +Proxy server must be configured to authenticate using ` + + `. + +Configuring Proxy Server for HTTPS +---------------------------------- + +By default, proxy server only handles HTTP request. To configure the +proxy server to process HTTPS requests, perform the following steps: + +1. Create self-signed cert for SSL using the following commands: + + cd /etc/swift + openssl req -new -x509 -nodes -out cert.crt -keyout cert.key + +2. Add the following lines to `/etc/swift/proxy-server.conf `under + [DEFAULT] + + bind_port = 443 + cert_file = /etc/swift/cert.crt + key_file = /etc/swift/cert.key + +3. 
Restart the servers using the following commands: + + swift-init main stop + swift-init main start + +The following are the configurable options: + + Option Default Description + ------------ ------------ ------------------------------- + bind\_ip 0.0.0.0 IP Address for server to bind + bind\_port 80 Port for server to bind + swift\_dir /etc/swift Swift configuration directory + workers 1 Number of workers to fork + user swift swift user + cert\_file Path to the ssl .crt + key\_file Path to the ssl .key + + : proxy-server.conf Default Options in the [DEFAULT] section + + Option Default Description + ------------------------------- ----------------- ----------------------------------------------------------------------------------------------------------- + use paste.deploy entry point for the container server. For most cases, this should be `egg:swift#container`. + log\_name proxy-server Label used when logging + log\_facility LOG\_LOCAL0 Syslog log facility + log\_level INFO Log level + log\_headers True If True, log headers in each request + recheck\_account\_existence 60 Cache timeout in seconds to send memcached for account existence + recheck\_container\_existence 60 Cache timeout in seconds to send memcached for container existence + object\_chunk\_size 65536 Chunk size to read from object servers + client\_chunk\_size 65536 Chunk size to read from clients + memcache\_servers 127.0.0.1:11211 Comma separated list of memcached servers ip:port + node\_timeout 10 Request timeout to external services + client\_timeout 60 Timeout to read one chunk from a client + conn\_timeout 0.5 Connection timeout to external services + error\_suppression\_interval 60 Time in seconds that must elapse since the last error for a node to be considered no longer error limited + error\_suppression\_limit 10 Error count to consider a node error limited + allow\_account\_management false Whether account `PUT`s and `DELETE`s are even callable + + : proxy-server.conf Server Options in the 
[proxy-server] section + +Configuring Object Server +------------------------- + +The Object Server is a very simple blob storage server that can store, +retrieve, and delete objects stored on local devices. Objects are stored +as binary files on the file system with metadata stored in the file’s +extended attributes (xattrs). This requires that the underlying file +system choice for object servers support xattrs on files. + +The configurable options pertaining Object Server are stored in the file +`/etc/swift/object-server/1.conf`. The following is the sample +`object-server/1.conf` file: + + [DEFAULT] + devices = /srv/1/node + mount_check = false + bind_port = 6010 + user = root + log_facility = LOG_LOCAL2 + + [pipeline:main] + pipeline = gluster object-server + + [app:object-server] + use = egg:swift#object + + [filter:gluster] + use = egg:swift#gluster + + [object-replicator] + vm_test_mode = yes + + [object-updater] + [object-auditor] + +The following are the configurable options: + + Option Default Description + -------------- ------------ ---------------------------------------------------------------------------------------------------- + swift\_dir /etc/swift Swift configuration directory + devices /srv/node Mount parent directory where devices are mounted + mount\_check true Whether or not check if the devices are mounted to prevent accidentally writing to the root device + bind\_ip 0.0.0.0 IP Address for server to bind + bind\_port 6000 Port for server to bind + workers 1 Number of workers to fork + + : object-server.conf Default Options in the [DEFAULT] section + + Option Default Description + ---------------------- --------------- ---------------------------------------------------------------------------------------------------- + use paste.deploy entry point for the object server. For most cases, this should be `egg:swift#object`. 
+ log\_name object-server log name used when logging + log\_facility LOG\_LOCAL0 Syslog log facility + log\_level INFO Logging level + log\_requests True Whether or not to log each request + user swift swift user + node\_timeout 3 Request timeout to external services + conn\_timeout 0.5 Connection timeout to external services + network\_chunk\_size 65536 Size of chunks to read or write over the network + disk\_chunk\_size 65536 Size of chunks to read or write to disk + max\_upload\_time 65536 Maximum time allowed to upload an object + slow 0 If \> 0, Minimum time in seconds for a `PUT` or `DELETE` request to complete + + : object-server.conf Server Options in the [object-server] section + +Configuring Container Server +---------------------------- + +The Container Server’s primary job is to handle listings of objects. The +listing is done by querying the GlusterFS mount point with path. This +query returns a list of all files and directories present under that +container. + +The configurable options pertaining to container server are stored in +`/etc/swift/container-server/1.conf` file. 
The following is the sample +`container-server/1.conf` file: + + [DEFAULT] + devices = /srv/1/node + mount_check = false + bind_port = 6011 + user = root + log_facility = LOG_LOCAL2 + + [pipeline:main] + pipeline = gluster container-server + + [app:container-server] + use = egg:swift#container + + [filter:gluster] + use = egg:swift#gluster + + [container-replicator] + [container-updater] + [container-auditor] + +The following are the configurable options: + + Option Default Description + -------------- ------------ ---------------------------------------------------------------------------------------------------- + swift\_dir /etc/swift Swift configuration directory + devices /srv/node Mount parent directory where devices are mounted + mount\_check true Whether or not check if the devices are mounted to prevent accidentally writing to the root device + bind\_ip 0.0.0.0 IP Address for server to bind + bind\_port 6001 Port for server to bind + workers 1 Number of workers to fork + user swift Swift user + + : container-server.conf Default Options in the [DEFAULT] section + + Option Default Description + --------------- ------------------ ---------------------------------------------------------------------------------------------------------- + use paste.deploy entry point for the container server. For most cases, this should be `egg:swift#container`. + log\_name container-server Label used when logging + log\_facility LOG\_LOCAL0 Syslog log facility + log\_level INFO Logging level + node\_timeout 3 Request timeout to external services + conn\_timeout 0.5 Connection timeout to external services + + : container-server.conf Server Options in the [container-server] + section + +Configuring Account Server +-------------------------- + +The Account Server is very similar to the Container Server, except that +it is responsible for listing of containers rather than objects. In UFO, +each gluster volume is an account. 
+ +The configurable options pertaining to account server are stored in +`/etc/swift/account-server/1.conf` file. The following is the sample +`account-server/1.conf` file: + + [DEFAULT] + devices = /srv/1/node + mount_check = false + bind_port = 6012 + user = root + log_facility = LOG_LOCAL2 + + [pipeline:main] + pipeline = gluster account-server + + [app:account-server] + use = egg:swift#account + + [filter:gluster] + use = egg:swift#gluster + + [account-replicator] + vm_test_mode = yes + + [account-auditor] + [account-reaper] + +The following are the configurable options: + + Option Default Description + -------------- ------------ ---------------------------------------------------------------------------------------------------- + swift\_dir /etc/swift Swift configuration directory + devices /srv/node mount parent directory where devices are mounted + mount\_check true Whether or not check if the devices are mounted to prevent accidentally writing to the root device + bind\_ip 0.0.0.0 IP Address for server to bind + bind\_port 6002 Port for server to bind + workers 1 Number of workers to fork + user swift Swift user + + : account-server.conf Default Options in the [DEFAULT] section + + Option Default Description + --------------- ---------------- ---------------------------------------------------------------------------------------------------------- + use paste.deploy entry point for the container server. For most cases, this should be `egg:swift#container`. + log\_name account-server Label used when logging + log\_facility LOG\_LOCAL0 Syslog log facility + log\_level INFO Logging level + + : account-server.conf Server Options in the [account-server] section + +Starting and Stopping Server +---------------------------- + +You must start the server manually when system reboots and whenever you +update/modify the configuration files. 
+
+- To start the server, enter the following command:
+
+    `# swift-init main start`
+
+- To stop the server, enter the following command:
+
+    `# swift-init main stop`
+
+Working with Unified File and Object Storage
+============================================
+
+This section describes the REST API for administering and managing
+Object Storage. All requests will be directed to the host and URL
+described in the `X-Storage-URL HTTP` header obtained during successful
+authentication.
+
+Configuring Authenticated Access
+--------------------------------
+
+Authentication is the process of proving identity to the system. To use
+the REST interface, you must obtain an authorization token using GET
+method and supply it with v1.0 as the path.
+
+Each REST request against the Object Storage system requires the
+addition of a specific authorization token HTTP x-header, defined as
+X-Auth-Token. The storage URL and authentication token are returned in
+the headers of the response.
+
+- To authenticate, run the following command:
+
+        GET auth/v1.0 HTTP/1.1
+        Host: <auth URL>
+        X-Auth-User: <account name>:<user name>
+        X-Auth-Key: <user-Password>
+
+    For example,
+
+        GET auth/v1.0 HTTP/1.1
+        Host: auth.example.com
+        X-Auth-User: test:tester
+        X-Auth-Key: testing
+
+        HTTP/1.1 200 OK
+        X-Storage-Url: https://example.storage.com:443/v1/AUTH_test
+        X-Storage-Token: AUTH_tkde3ad38b087b49bbbac0494f7600a554
+        X-Auth-Token: AUTH_tkde3ad38b087b49bbbac0494f7600a554
+        Content-Length: 0
+        Date: Wed, 10 Jul 2011 06:11:51 GMT
+
+    To authenticate access using cURL (for the above example), run the
+    following command:
+
+        curl -v -H 'X-Storage-User: test:tester' -H 'X-Storage-Pass:testing' -k
+        https://auth.example.com:443/auth/v1.0
+
+    The X-Auth-Url has to be parsed and used in the connection and
+    request line of all subsequent requests to the server.
In the + example output, users connecting to server will send most + container/object requests with a host header of example.storage.com + and the request line's version and account as v1/AUTH\_test. + +> **Note** +> +> The authentication tokens are valid for a 24 hour period. + +Working with Accounts +--------------------- + +This section describes the list of operations you can perform at the +account level of the URL. + +### Displaying Container Information + +You can list the objects of a specific container, or all containers, as +needed using GET command. You can use the following optional parameters +with GET request to refine the results: + + Parameter Description + ----------- -------------------------------------------------------------------------- + limit Limits the number of results to at most *n* value. + marker Returns object names greater in value than the specified marker. + format Specify either json or xml to return the respective serialized response. + +**To display container information** + +- List all the containers of an account using the following command: + + GET /<apiversion>/<account> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <authentication-token-key> + + For example, + + GET /v1/AUTH_test HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + + HTTP/1.1 200 Ok + Date: Wed, 13 Jul 2011 16:32:21 GMT + Server: Apache + Content-Type: text/plain; charset=UTF-8 + Content-Length: 39 + + songs + movies + documents + reports + +To display container information using cURL (for the above example), run +the following command: + + curl -v -X GET -H 'X-Auth-Token: AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test -k + +### Displaying Account Metadata Information + +You can issue HEAD command to the storage service to view the number of +containers and the total bytes stored in the account. 
+
+- To display containers and storage used, run the following command:
+
+        HEAD /<apiversion>/<account> HTTP/1.1
+        Host: <storage URL>
+        X-Auth-Token: <authentication-token-key>
+
+    For example,
+
+        HEAD /v1/AUTH_test HTTP/1.1
+        Host: example.storage.com
+        X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554
+
+        HTTP/1.1 204 No Content
+        Date: Wed, 13 Jul 2011 16:52:21 GMT
+        Server: Apache
+        X-Account-Container-Count: 4
+        X-Account-Total-Bytes-Used: 394792
+
+    To display account metadata information using cURL (for the above
+    example), run the following command:
+
+        curl -v -X HEAD -H 'X-Auth-Token:
+        AUTH_tkde3ad38b087b49bbbac0494f7600a554'
+        https://example.storage.com:443/v1/AUTH_test -k
+
+Working with Containers
+-----------------------
+
+This section describes the list of operations you can perform at the
+container level of the URL.
+
+### Creating Containers
+
+You can use PUT command to create containers. Containers are the storage
+folders for your data. The URL encoded name must be less than 256 bytes
+and cannot contain a forward slash '/' character.
+
+- To create a container, run the following command:
+
+        PUT /<apiversion>/<account>/<container>/ HTTP/1.1
+        Host: <storage URL>
+        X-Auth-Token: <authentication-token-key>
+
+    For example,
+
+        PUT /v1/AUTH_test/pictures/ HTTP/1.1
+        Host: example.storage.com
+        X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554
+        HTTP/1.1 201 Created
+
+        Date: Wed, 13 Jul 2011 17:32:21 GMT
+        Server: Apache
+        Content-Type: text/plain; charset=UTF-8
+
+    To create container using cURL (for the above example), run the
+    following command:
+
+        curl -v -X PUT -H 'X-Auth-Token:
+        AUTH_tkde3ad38b087b49bbbac0494f7600a554'
+        https://example.storage.com:443/v1/AUTH_test/pictures -k
+
+    The status code of 201 (Created) indicates that you have
+    successfully created the container. If a container with the same
+    name already exists, the status code of 202 (Accepted) is displayed.
+
+### Displaying Objects of a Container
+
+You can list the objects of a container using GET command. You can use
+the following optional parameters with GET request to refine the
+results:
+
+  Parameter   Description
+  ----------- --------------------------------------------------------------------------------------------------------------
+  limit       Limits the number of results to at most *n* value.
+  marker      Returns object names greater in value than the specified marker.
+  prefix      Displays the results limited to object names beginning with the substring x.
+  path        Returns the object names nested in the pseudo path.
+  format      Specify either json or xml to return the respective serialized response.
+  delimiter   Returns all the object names nested in the container.
+
+**To display objects of a container**
+
+- List objects of a specific container using the following command:
+
+<!-- -->
+
+    GET /<apiversion>/<account>/<container>[parm=value] HTTP/1.1
+    Host: <storage URL>
+    X-Auth-Token: <authentication-token-key>
+
+For example,
+
+    GET /v1/AUTH_test/images HTTP/1.1
+    Host: example.storage.com
+    X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554
+
+    HTTP/1.1 200 Ok
+    Date: Wed, 13 Jul 2011 15:42:21 GMT
+    Server: Apache
+    Content-Type: text/plain; charset=UTF-8
+    Content-Length: 139
+
+    sample file.jpg
+    test-file.pdf
+    You and Me.pdf
+    Puddle of Mudd.mp3
+    Test Reports.doc
+
+To display objects of a container using cURL (for the above example),
+run the following command:
+
+    curl -v -X GET -H 'X-Auth-Token: AUTH_tkde3ad38b087b49bbbac0494f7600a554'
+    https://example.storage.com:443/v1/AUTH_test/images -k
+
+### Displaying Container Metadata Information
+
+You can issue HEAD command to the storage service to view the number of
+objects in a container and the total bytes of all the objects stored in
+the container.
+ +- To display list of objects and storage used, run the following + command: + + HEAD /<apiversion>/<account>/<container> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <authentication-token-key> + + For example, + + HEAD /v1/AUTH_test/images HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + + HTTP/1.1 204 No Content + Date: Wed, 13 Jul 2011 19:52:21 GMT + Server: Apache + X-Account-Object-Count: 8 + X-Container-Bytes-Used: 472 + + To display list of objects and storage used in a container using + cURL (for the above example), run the following command: + + curl -v -X HEAD -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/images -k + +### Deleting Container + +You can use DELETE command to permanently delete containers. The +container must be empty before it can be deleted. + +You can issue HEAD command to determine if it contains any objects. + +- To delete a container, run the following command: + + DELETE /<apiversion>/<account>/<container>/ HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <authentication-token-key> + + For example, + + DELETE /v1/AUTH_test/pictures HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + + HTTP/1.1 204 No Content + Date: Wed, 13 Jul 2011 17:52:21 GMT + Server: Apache + Content-Length: 0 + Content-Type: text/plain; charset=UTF-8 + + To delete a container using cURL (for the above example), run the + following command: + + curl -v -X DELETE -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/pictures -k + + The status code of 204 (No Content) indicates that you have + successfully deleted the container. If that container does not + exist, the status code 404 (Not Found) is displayed, and if the + container is not empty, the status code 409 (Conflict) is displayed. 
+ +### Updating Container Metadata + +You can update the metadata of container using POST operation, metadata +keys should be prefixed with 'x-container-meta'. + +- To update the metadata of the object, run the following command: + + POST /<apiversion>/<account>/<container> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <Authentication-token-key> + X-Container-Meta-<key>: <new value> + X-Container-Meta-<key>: <new value> + + For example, + + POST /v1/AUTH_test/images HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + X-Container-Meta-Zoo: Lion + X-Container-Meta-Home: Dog + + HTTP/1.1 204 No Content + Date: Wed, 13 Jul 2011 20:52:21 GMT + Server: Apache + Content-Type: text/plain; charset=UTF-8 + + To update the metadata of the object using cURL (for the above + example), run the following command: + + curl -v -X POST -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/images -H ' X-Container-Meta-Zoo: Lion' -H 'X-Container-Meta-Home: Dog' -k + + The status code of 204 (No Content) indicates the container's + metadata is updated successfully. If that object does not exist, the + status code 404 (Not Found) is displayed. + +### Setting ACLs on Container + +You can set the container access control list by using POST command on +container with `x- container-read` and` x-container-write` keys. + +The ACL format is `[item[,item...]]`. Each item can be a group name to +give access to or a referrer designation to grant or deny based on the +HTTP Referer header. + +The referrer designation format is:` .r:[-]value`. + +The .r can also be `.ref, .referer, `or .`referrer`; though it will be +shortened to.r for decreased character count usage. The value can be `*` +to specify any referrer host is allowed access. The leading minus sign +(-) indicates referrer hosts that should be denied access. 
+ +Examples of valid ACLs: + + .r:* + .r:*,bobs_account,sues_account:sue + bobs_account,sues_account:sue + +Examples of invalid ACLs: + + .r: + .r:- + +By default, allowing read access via `r `will not allow listing objects +in the container but allows retrieving objects from the container. To +turn on listings, use the .`rlistings` directive. Also, `.r` +designations are not allowed in headers whose names include the word +write. + +For example, to set all the objects access rights to "public" inside the +container using cURL (for the above example), run the following command: + + curl -v -X POST -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/images + -H 'X-Container-Read: .r:*' -k + +Working with Objects +-------------------- + +An object represents the data and any metadata for the files stored in +the system. Through the REST interface, metadata for an object can be +included by adding custom HTTP headers to the request and the data +payload as the request body. Objects name should not exceed 1024 bytes +after URL encoding. + +This section describes the list of operations you can perform at the +object level of the URL. + +### Creating or Updating Object + +You can use PUT command to write or update an object's content and +metadata. + +You can verify the data integrity by including an MD5checksum for the +object's data in the ETag header. ETag header is optional and can be +used to ensure that the object's contents are stored successfully in the +storage system. + +You can assign custom metadata to objects by including additional HTTP +headers on the PUT request. The objects created with custom metadata via +HTTP headers are identified with the`X-Object- Meta`- prefix. 
+ +- To create or update an object, run the following command: + + PUT /<apiversion>/<account>/<container>/<object> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <authentication-token-key> + ETag: da1e100dc9e7becc810986e37875ae38 + Content-Length: 342909 + X-Object-Meta-PIN: 2343 + + For example, + + PUT /v1/AUTH_test/pictures/dog HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + ETag: da1e100dc9e7becc810986e37875ae38 + + HTTP/1.1 201 Created + Date: Wed, 13 Jul 2011 18:32:21 GMT + Server: Apache + ETag: da1e100dc9e7becc810986e37875ae38 + Content-Length: 0 + Content-Type: text/plain; charset=UTF-8 + + To create or update an object using cURL (for the above example), + run the following command: + + curl -v -X PUT -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/pictures/dog -H 'Content- + Length: 0' -k + + The status code of 201 (Created) indicates that you have + successfully created or updated the object. If there is a missing + content-Length or Content-Type header in the request, the status + code of 412 (Length Required) is displayed. (Optionally) If the MD5 + checksum of the data written to the storage system does not match + the ETag value, the status code of 422 (Unprocessable Entity) is + displayed. + +#### Chunked Transfer Encoding + +You can upload data without knowing the size of the data to be uploaded. +You can do this by specifying an HTTP header of Transfer-Encoding: +chunked and without using a Content-Length header. + +You can use this feature while doing a DB dump, piping the output +through gzip, and then piping the data directly into Object Storage +without having to buffer the data to disk to compute the file size. 
+ +- To create or update an object, run the following command: + + PUT /<apiversion>/<account>/<container>/<object> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <authentication-token-key> + Transfer-Encoding: chunked + X-Object-Meta-PIN: 2343 + + For example, + + PUT /v1/AUTH_test/pictures/cat HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + Transfer-Encoding: chunked + X-Object-Meta-PIN: 2343 + 19 + A bunch of data broken up + D + into chunks. + 0 + +### Copying Object + +You can copy object from one container to another or add a new object +and then add reference to designate the source of the data from another +container. + +**To copy object from one container to another** + +- To add a new object and designate the source of the data from + another container, run the following command: + + COPY /<apiversion>/<account>/<container>/<sourceobject> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: < authentication-token-key> + Destination: /<container>/<destinationobject> + + For example, + + COPY /v1/AUTH_test/images/dogs HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + Destination: /photos/cats + + HTTP/1.1 201 Created + Date: Wed, 13 Jul 2011 18:32:21 GMT + Server: Apache + Content-Length: 0 + Content-Type: text/plain; charset=UTF-8 + + To copy an object using cURL (for the above example), run the + following command: + + curl -v -X COPY -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' -H 'Destination: /photos/cats' -k https://example.storage.com:443/v1/AUTH_test/images/dogs + + The status code of 201 (Created) indicates that you have + successfully copied the object. If there is a missing content-Length + or Content-Type header in the request, the status code of 412 + (Length Required) is displayed. + + You can also use PUT command to copy object by using additional + header `X-Copy-From: container/obj`. 
+ +- To use PUT command to copy an object, run the following command: + + PUT /v1/AUTH_test/photos/cats HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + X-Copy-From: /images/dogs + + HTTP/1.1 201 Created + Date: Wed, 13 Jul 2011 18:32:21 GMT + Server: Apache + Content-Type: text/plain; charset=UTF-8 + + To copy an object using cURL (for the above example), run the + following command: + + curl -v -X PUT -H 'X-Auth-Token: AUTH_tkde3ad38b087b49bbbac0494f7600a554' + -H 'X-Copy-From: /images/dogs' –k + https://example.storage.com:443/v1/AUTH_test/images/cats + + The status code of 201 (Created) indicates that you have + successfully copied the object. + +### Displaying Object Information + +You can issue GET command on an object to view the object data of the +object. + +- To display the content of an object run the following command: + + GET /<apiversion>/<account>/<container>/<object> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <Authentication-token-key> + + For example, + + GET /v1/AUTH_test/images/cat HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + + HTTP/1.1 200 Ok + Date: Wed, 13 Jul 2011 23:52:21 GMT + Server: Apache + Last-Modified: Thu, 14 Jul 2011 13:40:18 GMT + ETag: 8a964ee2a5e88be344f36c22562a6486 + Content-Length: 534210 + [.........] + + To display the content of an object using cURL (for the above + example), run the following command: + + curl -v -X GET -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/images/cat -k + + The status code of 200 (Ok) indicates the object's data is displayed + successfully. If that object does not exist, the status code 404 + (Not Found) is displayed. + +### Displaying Object Metadata + +You can issue HEAD command on an object to view the object metadata and +other standard HTTP headers. You must send only authorization token as +header. 
+ +- To display the metadata of the object, run the following command: + +<!-- --> + + HEAD /<apiversion>/<account>/<container>/<object> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <Authentication-token-key> + +For example, + + HEAD /v1/AUTH_test/images/cat HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + + HTTP/1.1 204 No Content + Date: Wed, 13 Jul 2011 21:52:21 GMT + Server: Apache + Last-Modified: Thu, 14 Jul 2011 13:40:18 GMT + ETag: 8a964ee2a5e88be344f36c22562a6486 + Content-Length: 512000 + Content-Type: text/plain; charset=UTF-8 + X-Object-Meta-House: Cat + X-Object-Meta-Zoo: Cat + X-Object-Meta-Home: Cat + X-Object-Meta-Park: Cat + +To display the metadata of the object using cURL (for the above +example), run the following command: + + curl -v -X HEAD -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/images/cat -k + +The status code of 204 (No Content) indicates the object's metadata is +displayed successfully. If that object does not exist, the status code +404 (Not Found) is displayed. + +### Updating Object Metadata + +You can issue POST command on an object name only to set or overwrite +arbitrary key metadata. You cannot change the object's other headers +such as Content-Type, ETag and others using POST operation. The POST +command will delete all the existing metadata and replace it with the +new arbitrary key metadata. + +You must prefix **X-Object-Meta-** to the key names. 
+ +- To update the metadata of an object, run the following command: + + POST /<apiversion>/<account>/<container>/<object> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <Authentication-token-key> + X-Object-Meta-<key>: <new value> + X-Object-Meta-<key>: <new value> + + For example, + + POST /v1/AUTH_test/images/cat HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + X-Object-Meta-Zoo: Lion + X-Object-Meta-Home: Dog + + HTTP/1.1 202 Accepted + Date: Wed, 13 Jul 2011 22:52:21 GMT + Server: Apache + Content-Length: 0 + Content-Type: text/plain; charset=UTF-8 + + To update the metadata of an object using cURL (for the above + example), run the following command: + + curl -v -X POST -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/images/cat -H ' X-Object- + Meta-Zoo: Lion' -H 'X-Object-Meta-Home: Dog' -k + + The status code of 202 (Accepted) indicates that you have + successfully updated the object's metadata. If that object does not + exist, the status code 404 (Not Found) is displayed. + +### Deleting Object + +You can use DELETE command to permanently delete the object. + +The DELETE command on an object will be processed immediately and any +subsequent operations like GET, HEAD, POST, or DELETE on the object will +display 404 (Not Found) error. 
+ +- To delete an object, run the following command: + + DELETE /<apiversion>/<account>/<container>/<object> HTTP/1.1 + Host: <storage URL> + X-Auth-Token: <Authentication-token-key> + + For example, + + DELETE /v1/AUTH_test/pictures/cat HTTP/1.1 + Host: example.storage.com + X-Auth-Token: AUTH_tkd3ad38b087b49bbbac0494f7600a554 + + HTTP/1.1 204 No Content + Date: Wed, 13 Jul 2011 20:52:21 GMT + Server: Apache + Content-Type: text/plain; charset=UTF-8 + + To delete an object using cURL (for the above example), run the + following command: + + curl -v -X DELETE -H 'X-Auth-Token: + AUTH_tkde3ad38b087b49bbbac0494f7600a554' + https://example.storage.com:443/v1/AUTH_test/pictures/cat -k + + The status code of 204 (No Content) indicates that you have + successfully deleted the object. If that object does not exist, the + status code 404 (Not Found) is displayed. + + []: http://download.gluster.com/pub/gluster/glusterfs/3.2/UFO/ diff --git a/doc/admin-guide/en-US/markdown/admin_commandref.md b/doc/admin-guide/en-US/markdown/admin_commandref.md new file mode 100644 index 000000000..4ff05f4ef --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_commandref.md @@ -0,0 +1,180 @@ +Command Reference +================= + +This section describes the available commands and includes the following +section: + +- gluster Command + + Gluster Console Manager (command line interpreter) + +- glusterd Daemon + + Gluster elastic volume management daemon + +gluster Command +=============== + +**NAME** + +gluster - Gluster Console Manager (command line interpreter) + +**SYNOPSIS** + +To run the program and display the gluster prompt: + +**gluster** + +To specify a command directly: gluster [COMMANDS] [OPTIONS] + +**DESCRIPTION** + +The Gluster Console Manager is a command line utility for elastic volume +management. You can run the gluster command on any export server. 
The +command enables administrators to perform cloud operations such as +creating, expanding, shrinking, rebalancing, and migrating volumes +without needing to schedule server downtime. + +**COMMANDS** + + --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Command Description + ---------------------------------------------------------------------------------------------------------- ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + **Volume** + + volume info [all | VOLNAME] Displays information about all volumes, or the specified volume. + + volume create NEW-VOLNAME [stripe COUNT] [replica COUNT] [transport tcp | rdma | tcp,rdma] NEW-BRICK ... Creates a new volume of the specified type using the specified bricks and transport type (the default transport type is tcp). + + volume delete VOLNAME Deletes the specified volume. + + volume start VOLNAME Starts the specified volume. + + volume stop VOLNAME [force] Stops the specified volume. + + volume rename VOLNAME NEW-VOLNAME Renames the specified volume. 
+ + volume help Displays help for the volume command. + + **Brick** + + volume add-brick VOLNAME NEW-BRICK ... Adds the specified brick to the specified volume. + + volume replace-brick VOLNAME (BRICK NEW-BRICK) start | pause | abort | status Replaces the specified brick. + + volume remove-brick VOLNAME [(replica COUNT)|(stripe COUNT)] BRICK ... Removes the specified brick from the specified volume. + + **Rebalance** + + volume rebalance VOLNAME start Starts rebalancing the specified volume. + + volume rebalance VOLNAME stop Stops rebalancing the specified volume. + + volume rebalance VOLNAME status Displays the rebalance status of the specified volume. + + **Log** + + volume log filename VOLNAME [BRICK] DIRECTORY Sets the log directory for the corresponding volume/brick. + + volume log rotate VOLNAME [BRICK] Rotates the log file for corresponding volume/brick. + + volume log locate VOLNAME [BRICK] Locates the log file for corresponding volume/brick. + + **Peer** + + peer probe HOSTNAME Probes the specified peer. + + peer detach HOSTNAME Detaches the specified peer. + + peer status Displays the status of peers. + + peer help Displays help for the peer command. + + **Geo-replication** + + volume geo-replication MASTER SLAVE start Start geo-replication between the hosts specified by MASTER and SLAVE. You can specify a local master volume as :VOLNAME. + + You can specify a local slave volume as :VOLUME and a local slave directory as /DIRECTORY/SUB-DIRECTORY. You can specify a remote slave volume as DOMAIN::VOLNAME and a remote slave directory as DOMAIN:/DIRECTORY/SUB-DIRECTORY. + + volume geo-replication MASTER SLAVE stop Stop geo-replication between the hosts specified by MASTER and SLAVE. You can specify a local master volume as :VOLNAME and a local master directory as /DIRECTORY/SUB-DIRECTORY. + + You can specify a local slave volume as :VOLNAME and a local slave directory as /DIRECTORY/SUB-DIRECTORY. 
You can specify a remote slave volume as DOMAIN::VOLNAME and a remote slave directory as DOMAIN:/DIRECTORY/SUB-DIRECTORY. + + volume geo-replication MASTER SLAVE config [options] Configure geo-replication options between the hosts specified by MASTER and SLAVE. + + gluster-command COMMAND The path where the gluster command is installed. + + gluster-log-level LOGFILELEVEL The log level for gluster processes. + + log-file LOGFILE The path to the geo-replication log file. + + log-level LOGFILELEVEL The log level for geo-replication. + + remote-gsyncd COMMAND The path where the gsyncd binary is installed on the remote machine. + + ssh-command COMMAND The ssh command to use to connect to the remote machine (the default is ssh). + + rsync-command COMMAND The rsync command to use for synchronizing the files (the default is rsync). + + volume\_id= UID The command to delete the existing master UID for the intermediate/slave node. + + timeout SECONDS The timeout period. + + sync-jobs N The number of simultaneous files/directories that can be synchronized. + + ignore-deletes If this option is set to 1, a file deleted on master will not trigger a delete operation on the slave. Hence, the slave will remain as a superset of the master and can be used to recover the master in case of crash and/or accidental delete. + + **Other** + + help Display the command options. + + quit Exit the gluster command line interface. 
+ --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +**FILES** + +/var/lib/glusterd/\* + +**SEE ALSO** + +fusermount(1), mount.glusterfs(8), glusterfs-volgen(8), glusterfs(8), +glusterd(8) + +glusterd Daemon +=============== + +**NAME** + +glusterd - Gluster elastic volume management daemon + +**SYNOPSIS** + +glusterd [OPTION...] + +**DESCRIPTION** + +The glusterd daemon is used for elastic volume management. The daemon +must be run on all export servers. + +**OPTIONS** + + Option Description + ----------------------------------- ---------------------------------------------------------------------------------------------------------------- + **Basic** + -l=LOGFILE, --log-file=LOGFILE Files to use for logging (the default is /usr/local/var/log/glusterfs/glusterfs.log). + -L=LOGLEVEL, --log-level=LOGLEVEL Logging severity. Valid options are TRACE, DEBUG, INFO, WARNING, ERROR and CRITICAL (the default is INFO). + --debug Runs the program in debug mode. This option sets --no-daemon, --log-level to DEBUG, and --log-file to console. + -N, --no-daemon Runs the program in the foreground. + **Miscellaneous** + -?, --help Displays this help. + --usage Displays a short usage message. + -V, --version Prints the program version. 
+ +**FILES** + +/var/lib/glusterd/\* + +**SEE ALSO** + +fusermount(1), mount.glusterfs(8), glusterfs-volgen(8), glusterfs(8), +gluster(8) diff --git a/doc/admin-guide/en-US/markdown/admin_console.md b/doc/admin-guide/en-US/markdown/admin_console.md new file mode 100644 index 000000000..9b69de02d --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_console.md @@ -0,0 +1,51 @@ +Using the Gluster Console Manager – Command Line Utility +======================================================== + +The Gluster Console Manager is a single command line utility that +simplifies configuration and management of your storage environment. The +Gluster Console Manager is similar to the LVM (Logical Volume Manager) +CLI or ZFS Command Line Interface, but across multiple storage servers. +You can use the Gluster Console Manager online, while volumes are +mounted and active. Gluster automatically synchronizes volume +configuration information across all Gluster servers. + +Using the Gluster Console Manager, you can create new volumes, start +volumes, and stop volumes, as required. You can also add bricks to +volumes, remove bricks from existing volumes, as well as change +translator settings, among other operations. + +You can also use the commands to create scripts for automation, as well +as use the commands as an API to allow integration with third-party +applications. + +**Running the Gluster Console Manager** + +You can run the Gluster Console Manager on any GlusterFS server either +by invoking the commands or by running the Gluster CLI in interactive +mode. You can also use the gluster command remotely using SSH. 
+ +- To run commands directly: + + ` # gluster peer ` + + For example: + + ` # gluster peer status ` + +- To run the Gluster Console Manager in interactive mode + + `# gluster` + + You can execute gluster commands from the Console Manager prompt: + + ` gluster> ` + + For example, to view the status of the peer server: + + \# `gluster ` + + `gluster > peer status ` + + Display the status of the peer. + + diff --git a/doc/admin-guide/en-US/markdown/admin_directory_Quota.md b/doc/admin-guide/en-US/markdown/admin_directory_Quota.md new file mode 100644 index 000000000..09c757781 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_directory_Quota.md @@ -0,0 +1,172 @@ +Managing Directory Quota +======================== + +Directory quotas in GlusterFS allow you to set limits on usage of disk +space by directories or volumes. The storage administrators can control +the disk space utilization at the directory and/or volume levels in +GlusterFS by setting limits to allocatable disk space at any level in +the volume and directory hierarchy. This is particularly useful in cloud +deployments to facilitate utility billing model. + +> **Note** +> +> For now, only Hard limit is supported. Here, the limit cannot be +> exceeded and attempts to use more disk space or inodes beyond the set +> limit will be denied. + +System administrators can also monitor the resource utilization to limit +the storage for the users depending on their role in the organization. + +You can set the quota at the following levels: + +- Directory level – limits the usage at the directory level + +- Volume level – limits the usage at the volume level + +> **Note** +> +> You can set the disk limit on the directory even if it is not created. +> The disk limit is enforced immediately after creating that directory. +> For more information on setting disk limit, see ?. + +Enabling Quota +============== + +You must enable Quota to set disk limits. 
+ +**To enable quota** + +- Enable the quota using the following command: + + `# gluster volume quota enable ` + + For example, to enable quota on test-volume: + + # gluster volume quota test-volume enable + Quota is enabled on /test-volume + +Disabling Quota +=============== + +You can disable Quota, if needed. + +**To disable quota:** + +- Disable the quota using the following command: + + `# gluster volume quota disable ` + + For example, to disable quota translator on test-volume: + + # gluster volume quota test-volume disable + Quota translator is disabled on /test-volume + +Setting or Replacing Disk Limit +=============================== + +You can create new directories in your storage environment and set the +disk limit or set disk limit for the existing directories. The directory +name should be relative to the volume with the export directory/mount +being treated as "/". + +**To set or replace disk limit** + +- Set the disk limit using the following command: + + `# gluster volume quota limit-usage /` + + For example, to set limit on data directory on test-volume where + data is a directory under the export directory: + + # gluster volume quota test-volume limit-usage /data 10GB + Usage limit has been set on /data + + > **Note** + > + > In a multi-level directory hierarchy, the strictest disk limit + > will be considered for enforcement. + +Displaying Disk Limit Information +================================= + +You can display disk limit information on all the directories on which +the limit is set. 
+ +**To display disk limit information** + +- Display disk limit information of all the directories on which limit + is set, using the following command: + + `# gluster volume quota list` + + For example, to see the set disks limit on test-volume: + + # gluster volume quota test-volume list + + + /Test/data 10 GB 6 GB + /Test/data1 10 GB 4 GB + +- Display disk limit information on a particular directory on which + limit is set, using the following command: + + `# gluster volume quota list ` + + For example, to see the set limit on /data directory of test-volume: + + # gluster volume quota test-volume list /data + + + /Test/data 10 GB 6 GB + +Updating Memory Cache Size +========================== + +For performance reasons, quota caches the directory sizes on client. You +can set timeout indicating the maximum valid duration of directory sizes +in cache, from the time they are populated. + +For example: If there are multiple clients writing to a single +directory, there are chances that some other client might write till the +quota limit is exceeded. However, this new file-size may not get +reflected in the client till size entry in cache has become stale +because of timeout. If writes happen on this client during this +duration, they are allowed even though they would lead to exceeding of +quota-limits, since size in cache is not in sync with the actual size. +When timeout happens, the size in cache is updated from servers and will +be in sync and no further writes will be allowed. A timeout of zero will +force fetching of directory sizes from server for every operation that +modifies file data and will effectively disables directory size caching +on client side. 
+ +**To update the memory cache size** + +- Update the memory cache size using the following command: + + `# gluster volume set features.quota-timeout` + + For example, to update the memory cache size for every 5 seconds on + test-volume: + + # gluster volume set test-volume features.quota-timeout 5 + Set volume successful + +Removing Disk Limit +=================== + +You can remove set disk limit, if you do not want quota anymore. + +**To remove disk limit** + +- Remove disk limit set on a particular directory using the following + command: + + `# gluster volume quota remove ` + + For example, to remove the disk limit on /data directory of + test-volume: + + # gluster volume quota test-volume remove /data + Usage limit set on /data is removed + + diff --git a/doc/admin-guide/en-US/markdown/admin_geo-replication.md b/doc/admin-guide/en-US/markdown/admin_geo-replication.md new file mode 100644 index 000000000..849957244 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_geo-replication.md @@ -0,0 +1,738 @@ +Managing Geo-replication +======================== + +Geo-replication provides a continuous, asynchronous, and incremental +replication service from one site to another over Local Area Networks +(LANs), Wide Area Network (WANs), and across the Internet. + +Geo-replication uses a master–slave model, whereby replication and +mirroring occurs between the following partners: + +- Master – a GlusterFS volume + +- Slave – a slave which can be of the following types: + + - A local directory which can be represented as file URL like + `file:///path/to/dir`. You can use shortened form, for example, + ` /path/to/dir`. + + - A GlusterFS Volume - Slave volume can be either a local volume + like `gluster://localhost:volname` (shortened form - `:volname`) + or a volume served by different host like + `gluster://host:volname` (shortened form - `host:volname`). + + > **Note** + > + > Both of the above types can be accessed remotely using SSH tunnel. 
+ > To use SSH, add an SSH prefix to either a file URL or gluster type + > URL. For example, ` ssh://root@remote-host:/path/to/dir` + > (shortened form - `root@remote-host:/path/to/dir`) or + > `ssh://root@remote-host:gluster://localhost:volname` (shortened + > from - `root@remote-host::volname`). + +This section introduces Geo-replication, illustrates the various +deployment scenarios, and explains how to configure the system to +provide replication and mirroring in your environment. + +Replicated Volumes vs Geo-replication +===================================== + +The following table lists the difference between replicated volumes and +geo-replication: + + Replicated Volumes Geo-replication + --------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------------------- + Mirrors data across clusters Mirrors data across geographically distributed clusters + Provides high-availability Ensures backing up of data for disaster recovery + Synchronous replication (each and every file operation is sent across all the bricks) Asynchronous replication (checks for the changes in files periodically and syncs them on detecting differences) + +Preparing to Deploy Geo-replication +=================================== + +This section provides an overview of the Geo-replication deployment +scenarios, describes how you can check the minimum system requirements, +and explores common deployment scenarios. + +- ? + +- ? + +- ? + +- ? + +- ? + +Exploring Geo-replication Deployment Scenarios +---------------------------------------------- + +Geo-replication provides an incremental replication service over Local +Area Networks (LANs), Wide Area Network (WANs), and across the Internet. 
+This section illustrates the most common deployment scenarios for +Geo-replication, including the following: + +- Geo-replication over LAN + +- Geo-replication over WAN + +- Geo-replication over the Internet + +- Multi-site cascading Geo-replication + +**Geo-replication over LAN** + +You can configure Geo-replication to mirror data over a Local Area +Network. + +![ Geo-replication over LAN ][] + +**Geo-replication over WAN** + +You can configure Geo-replication to replicate data over a Wide Area +Network. + +![ Geo-replication over WAN ][] + +**Geo-replication over Internet** + +You can configure Geo-replication to mirror data over the Internet. + +![ Geo-replication over Internet ][] + +**Multi-site cascading Geo-replication** + +You can configure Geo-replication to mirror data in a cascading fashion +across multiple sites. + +![ Multi-site cascading Geo-replication ][] + +Geo-replication Deployment Overview +----------------------------------- + +Deploying Geo-replication involves the following steps: + +1. Verify that your environment matches the minimum system requirement. + For more information, see ?. + +2. Determine the appropriate deployment scenario. For more information, + see ?. + +3. Start Geo-replication on master and slave systems, as required. For + more information, see ?. + +Checking Geo-replication Minimum Requirements +--------------------------------------------- + +Before deploying GlusterFS Geo-replication, verify that your systems +match the minimum requirements. 
+ +The following table outlines the minimum requirements for both master +and slave nodes within your environment: + + Component Master Slave + ------------------------ --------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Operating System GNU/Linux GNU/Linux + Filesystem GlusterFS 3.2 or higher GlusterFS 3.2 or higher (GlusterFS needs to be installed, but does not need to be running), ext3, ext4, or XFS (any other POSIX compliant file system would work, but has not been tested extensively) + Python Python 2.4 (with ctypes external module), or Python 2.5 (or higher) Python 2.4 (with ctypes external module), or Python 2.5 (or higher) + Secure shell OpenSSH version 4.0 (or higher) SSH2-compliant daemon + Remote synchronization rsync 3.0.7 or higher rsync 3.0.7 or higher + FUSE GlusterFS supported versions GlusterFS supported versions + +Setting Up the Environment for Geo-replication +---------------------------------------------- + +**Time Synchronization** + +- On bricks of a geo-replication master volume, all the servers' time + must be uniform. You are recommended to set up NTP (Network Time + Protocol) service to keep the bricks sync in time and avoid + out-of-time sync effect. + + For example: In a Replicated volume where brick1 of the master is at + 12.20 hrs and brick 2 of the master is at 12.10 hrs with 10 minutes + time lag, all the changes in brick2 between this period may go + unnoticed during synchronization of files with Slave. + + For more information on setting up NTP, see [][]. + +**To setup Geo-replication for SSH** + +Password-less login has to be set up between the host machine (where +geo-replication Start command will be issued) and the remote machine +(where slave process should be launched through SSH). + +1. 
On the node where geo-replication sessions are to be set up, run the + following command: + + `# ssh-keygen -f /var/lib/glusterd/geo-replication/secret.pem` + + Press Enter twice to avoid passphrase. + +2. Run the following command on master for all the slave hosts: + + `# ssh-copy-id -i /var/lib/glusterd/geo-replication/secret.pem.pub @` + +Setting Up the Environment for a Secure Geo-replication Slave +------------------------------------------------------------- + +You can configure a secure slave using SSH so that master is granted a +restricted access. With GlusterFS, you need not specify configuration +parameters regarding the slave on the master-side configuration. For +example, the master does not require the location of the rsync program +on slave but the slave must ensure that rsync is in the PATH of the user +which the master connects using SSH. The only information that master +and slave have to negotiate are the slave-side user account, slave's +resources that master uses as slave resources, and the master's public +key. Secure access to the slave can be established using the following +options: + +- Restricting Remote Command Execution + +- Using `Mountbroker` for Slaves + +- Using IP based Access Control + +**Backward Compatibility** + +Your existing Ge-replication environment will work with GlusterFS, +except for the following: + +- The process of secure reconfiguration affects only the glusterfs + instance on slave. The changes are transparent to master with the + exception that you may have to change the SSH target to an + unprivileged account on slave. + +- The following are the some exceptions where this might not work: + + - Geo-replication URLs which specify the slave resource when + configuring master will include the following special + characters: space, \*, ?, [; + + - Slave must have a running instance of glusterd, even if there is + no gluster volume among the mounted slave resources (that is, + file tree slaves are used exclusively) . 
+ +### Restricting Remote Command Execution + +If you restrict remote command execution, then the Slave audits commands +coming from the master and the commands related to the given +geo-replication session is allowed. The Slave also provides access only +to the files within the slave resource which can be read or manipulated +by the Master. + +To restrict remote command execution: + +1. Identify the location of the gsyncd helper utility on Slave. This + utility is installed in `PREFIX/libexec/glusterfs/gsyncd`, where + PREFIX is a compile-time parameter of glusterfs. For example, + `--prefix=PREFIX` to the configure script with the following common + values` /usr, /usr/local, and /opt/glusterfs/glusterfs_version`. + +2. Ensure that command invoked from master to slave passed through the + slave's gsyncd utility. + + You can use either of the following two options: + + - Set gsyncd with an absolute path as the shell for the account + which the master connects through SSH. If you need to use a + privileged account, then set it up by creating a new user with + UID 0. + + - Setup key authentication with command enforcement to gsyncd. You + must prefix the copy of master's public key in the Slave + account's `authorized_keys` file with the following command: + + `command=<path to gsyncd>`. + + For example, + `command="PREFIX/glusterfs/gsyncd" ssh-rsa AAAAB3Nza....` + +### Using Mountbroker for Slaves + +`mountbroker` is a new service of glusterd. This service allows an +unprivileged process to own a GlusterFS mount by registering a label +(and DSL (Domain-specific language) options ) with glusterd through a +glusterd volfile. Using CLI, you can send a mount request to glusterd to +receive an alias (symlink) of the mounted volume. 
+ +A request from the agent , the unprivileged slave agents use the +mountbroker service of glusterd to set up an auxiliary gluster mount for +the agent in a special environment which ensures that the agent is only +allowed to access with special parameters that provide administrative +level access to the particular volume. + +**To setup an auxiliary gluster mount for the agent**: + +1. Create a new group. For example, `geogroup`. + +2. Create a unprivileged account. For example, ` geoaccount`. Make it a + member of ` geogroup`. + +3. Create a new directory owned by root and with permissions *0711.* + For example, create a create mountbroker-root directory + `/var/mountbroker-root`. + +4. Add the following options to the glusterd volfile, assuming the name + of the slave gluster volume as `slavevol`: + + `option mountbroker-root /var/mountbroker-root ` + + `option mountbroker-geo-replication.geoaccount slavevol` + + `option geo-replication-log-group geogroup` + + If you are unable to locate the glusterd volfile at + `/etc/glusterfs/glusterd.vol`, you can create a volfile containing + both the default configuration and the above options and place it at + `/etc/glusterfs/`. + + A sample glusterd volfile along with default options: + + volume management + type mgmt/glusterd + option working-directory /var/lib/glusterd + option transport-type socket,rdma + option transport.socket.keepalive-time 10 + option transport.socket.keepalive-interval 2 + option transport.socket.read-fail-log off + + option mountbroker-root /var/mountbroker-root + option mountbroker-geo-replication.geoaccount slavevol + option geo-replication-log-group geogroup + end-volume + + If you host multiple slave volumes on Slave, you can repeat step 2. + for each of them and add the following options to the `volfile`: + + option mountbroker-geo-replication.geoaccount2 slavevol2 + option mountbroker-geo-replication.geoaccount3 slavevol3 + +5. Setup Master to access Slave as `geoaccount@Slave`. 
+ + You can add multiple slave volumes within the same account + (geoaccount) by providing comma-separated list (without spaces) as + the argument of `mountbroker-geo-replication.geogroup`. You can also + have multiple options of the form `mountbroker-geo-replication.*`. + It is recommended to use one service account per Master machine. For + example, if there are multiple slave volumes on Slave for the master + machines Master1, Master2, and Master3, then create a dedicated + service user on Slave for them by repeating Step 2. for each (like + geogroup1, geogroup2, and geogroup3), and then add the following + corresponding options to the volfile: + + `option mountbroker-geo-replication.geoaccount1 slavevol11,slavevol12,slavevol13` + + `option mountbroker-geo-replication.geoaccount2 slavevol21,slavevol22` + + `option mountbroker-geo-replication.geoaccount3 slavevol31` + + Now set up Master1 to ssh to geoaccount1@Slave, etc. + + You must restart glusterd after making changes in the configuration + to effect the updates. + +### Using IP based Access Control + +You can use IP based access control method to provide access control for +the slave resources using IP address. You can use method for both Slave +and file tree slaves, but in the section, we are focusing on file tree +slaves using this method. + +To set access control based on IP address for file tree slaves: + +1. Set a general restriction for accessibility of file tree resources: + + `# gluster volume geo-replication '/*' config allow-network ::1,127.0.0.1 ` + + This will refuse all requests for spawning slave agents except for + requests initiated locally. + +2. If you want the to lease file tree at `/data/slave-tree` to Master, + enter the following command: + + `# gluster volume geo-replicationconfig allow-network ` + + `MasterIP` is the IP address of Master. The slave agent spawn + request from master will be accepted if it is executed at + `/data/slave-tree`. 
+ +If the Master side network configuration does not enable the Slave to +recognize the exact IP address of Master, you can use CIDR notation to +specify a subnet instead of a single IP address as MasterIP or even +comma-separated lists of CIDR subnets. + +If you want to extend IP based access control to gluster slaves, use the +following command: + +`# gluster volume geo-replication '*' config allow-network ::1,127.0.0.1` + +Starting Geo-replication +======================== + +This section describes how to configure and start Gluster +Geo-replication in your storage environment, and verify that it is +functioning correctly. + +- ? + +- ? + +- ? + +- ? + +- ? + +Starting Geo-replication +------------------------ + +To start Gluster Geo-replication + +- Start geo-replication between the hosts using the following command: + + `# gluster volume geo-replication start` + + For example: + + # gluster volume geo-replication Volume1 example.com:/data/remote_dir start + Starting geo-replication session between Volume1 + example.com:/data/remote_dir has been successful + + > **Note** + > + > You may need to configure the service before starting Gluster + > Geo-replication. For more information, see ?. + +Verifying Successful Deployment +------------------------------- + +You can use the gluster command to verify the status of Gluster +Geo-replication in your environment. + +**To verify the status Gluster Geo-replication** + +- Verify the status by issuing the following command on host: + + `# gluster volume geo-replication status` + + For example: + + `# gluster volume geo-replication Volume1 example.com:/data/remote_dir status` + + # gluster volume geo-replication Volume1 example.com:/data/remote_dir status + + MASTER SLAVE STATUS + ______ ______________________________ ____________ + Volume1 root@example.com:/data/remote_dir Starting.... 
+ +Displaying Geo-replication Status Information +--------------------------------------------- + +You can display status information about a specific geo-replication +master session, or a particular master-slave session, or all +geo-replication sessions, as needed. + +**To display geo-replication status information** + +- Display information of all geo-replication sessions using the + following command: + + # gluster volume geo-replication Volume1 example.com:/data/remote_dir status + + MASTER SLAVE STATUS + ______ ______________________________ ____________ + Volume1 root@example.com:/data/remote_dir Starting.... + +- Display information of a particular master slave session using the + following command: + + `# gluster volume geo-replication status` + + For example, to display information of Volume1 and + example.com:/data/remote\_dir + + `# gluster volume geo-replication Volume1 example.com:/data/remote_dir status` + + The status of the geo-replication between Volume1 and + example.com:/data/remote\_dir is displayed. + +- Display information of all geo-replication sessions belonging to a + master + + `# gluster volume geo-replication MASTER status` + + For example, to display information of Volume1 + + # gluster volume geo-replication Volume1 example.com:/data/remote_dir status + + MASTER SLAVE STATUS + ______ ______________________________ ____________ + Volume1 ssh://example.com:gluster://127.0.0.1:remove_volume OK + + Volume1 ssh://example.com:file:///data/remote_dir OK + + The status of a session could be one of the following four: + +- **Starting**: This is the initial phase of the Geo-replication + session; it remains in this state for a minute, to make sure no + abnormalities are present. + +- **OK**: The geo-replication session is in a stable state. + +- **Faulty**: The geo-replication session has witnessed some + abnormality and the situation has to be investigated further. For + further information, see ? section. 
+ +- **Corrupt**: The monitor thread which is monitoring the + geo-replication session has died. This situation should not occur + normally, if it persists contact Red Hat Support[][1]. + +Configuring Geo-replication +--------------------------- + +To configure Gluster Geo-replication + +- Use the following command at the Gluster command line: + + `# gluster volume geo-replication config [options]` + + For more information about the options, see ?. + + For example: + + To view list of all option/value pair, use the following command: + + `# gluster volume geo-replication Volume1 example.com:/data/remote_dir config` + +Stopping Geo-replication +------------------------ + +You can use the gluster command to stop Gluster Geo-replication (syncing +of data from Master to Slave) in your environment. + +**To stop Gluster Geo-replication** + +- Stop geo-replication between the hosts using the following command: + + `# gluster volume geo-replication stop ` + + For example: + + # gluster volume geo-replication Volume1 example.com:/data/remote_dir stop + Stopping geo-replication session between Volume1 and + example.com:/data/remote_dir has been successful + + See ? for more information about the gluster command. + +Restoring Data from the Slave +============================= + +You can restore data from the slave to the master volume, whenever the +master volume becomes faulty for reasons like hardware failure. + +The example in this section assumes that you are using the Master Volume +(Volume1) with the following configuration: + + machine1# gluster volume info + Type: Distribute + Status: Started + Number of Bricks: 2 + Transport-type: tcp + Bricks: + Brick1: machine1:/export/dir16 + Brick2: machine2:/export/dir16 + Options Reconfigured: + geo-replication.indexing: on + +The data is syncing from master volume (Volume1) to slave directory +(example.com:/data/remote\_dir). 
To view the status of this
+geo-replication session run the following command on Master:
+
+ # gluster volume geo-replication Volume1 root@example.com:/data/remote_dir status
+
+ MASTER SLAVE STATUS
+ ______ ______________________________ ____________
+ Volume1 root@example.com:/data/remote_dir OK
+
+**Before Failure**
+
+Assume that the Master volume had 100 files and was mounted at
+/mnt/gluster on one of the client machines (client). Run the following
+command on Client machine to view the list of files:
+
+ client# ls /mnt/gluster | wc –l
+ 100
+
+The slave directory (example.com) will have same data as in the master
+volume and same can be viewed by running the following command on slave:
+
+ example.com# ls /data/remote_dir/ | wc –l
+ 100
+
+**After Failure**
+
+If one of the bricks (machine2) fails, then the status of
+Geo-replication session is changed from "OK" to "Faulty". To view the
+status of this geo-replication session run the following command on
+Master:
+
+ # gluster volume geo-replication Volume1 root@example.com:/data/remote_dir status
+
+ MASTER SLAVE STATUS
+ ______ ______________________________ ____________
+ Volume1 root@example.com:/data/remote_dir Faulty
+
+Machine2 has failed and now you can see a discrepancy in the number of
+files between master and slave. A few files will be missing from the
+master volume but they will be available only on the slave as shown below.
+
+Run the following command on Client:
+
+ client # ls /mnt/gluster | wc –l
+ 52
+
+Run the following command on slave (example.com):
+
+ example.com# ls /data/remote_dir/ | wc –l
+ 100
+
+**To restore data from the slave machine**
+
+1. 
Stop all Master's geo-replication sessions using the following + command: + + `# gluster volume geo-replication stop` + + For example: + + machine1# gluster volume geo-replication Volume1 + example.com:/data/remote_dir stop + + Stopping geo-replication session between Volume1 & + example.com:/data/remote_dir has been successful + + > **Note** + > + > Repeat `# gluster volume geo-replication stop `command on all + > active geo-replication sessions of master volume. + +2. Replace the faulty brick in the master by using the following + command: + + `# gluster volume replace-brick start` + + For example: + + machine1# gluster volume replace-brick Volume1 machine2:/export/dir16 machine3:/export/dir16 start + Replace-brick started successfully + +3. Commit the migration of data using the following command: + + `# gluster volume replace-brick commit force ` + + For example: + + machine1# gluster volume replace-brick Volume1 machine2:/export/dir16 machine3:/export/dir16 commit force + Replace-brick commit successful + +4. Verify the migration of brick by viewing the volume info using the + following command: + + `# gluster volume info ` + + For example: + + machine1# gluster volume info + Volume Name: Volume1 + Type: Distribute + Status: Started + Number of Bricks: 2 + Transport-type: tcp + Bricks: + Brick1: machine1:/export/dir16 + Brick2: machine3:/export/dir16 + Options Reconfigured: + geo-replication.indexing: on + +5. Run rsync command manually to sync data from slave to master + volume's client (mount point). + + For example: + + `example.com# rsync -PavhS --xattrs --ignore-existing /data/remote_dir/ client:/mnt/gluster` + + Verify that the data is synced by using the following command: + + On master volume, run the following command: + + Client # ls | wc –l + 100 + + On the Slave run the following command: + + example.com# ls /data/remote_dir/ | wc –l + 100 + + Now Master volume and Slave directory is synced. + +6. 
Restart geo-replication session from master to slave using the
+ following command:
+
+ `# gluster volume geo-replication start `
+
+ For example:
+
+ machine1# gluster volume geo-replication Volume1
+ example.com:/data/remote_dir start
+ Starting geo-replication session between Volume1 &
+ example.com:/data/remote_dir has been successful
+
+Best Practices
+==============
+
+**Manually Setting Time**
+
+If you have to change the time on your bricks manually, then you must
+set uniform time on all bricks. This avoids the out-of-time sync issue
+described in ?. Setting time backward corrupts the geo-replication
+index, so the recommended way to set the time manually is:
+
+1. Stop geo-replication between the master and slave using the
+ following command:
+
+ `# gluster volume geo-replication stop`
+
+2. Stop the geo-replication indexing using the following command:
+
+ `# gluster volume set geo-replication.indexing off`
+
+3. Set uniform time on all bricks.
+
+4. Restart your geo-replication sessions by using the following
+ command:
+
+ `# gluster volume geo-replication start `
+
+**Running Geo-replication commands in one system**
+
+It is advisable to run the geo-replication commands in one of the bricks
+in the trusted storage pool. This is because the log files for the
+geo-replication session would be stored in the \*Server\* where the
+Geo-replication start is initiated. Hence it would be easier to locate
+the log-files when required.
+
+**Isolation**
+
+Geo-replication slave operation is not sandboxed as of now and is run as
+a privileged service. So for security reasons, it is advised to
+create a sandbox environment (dedicated machine / dedicated virtual
+machine / chroot/container type solution) by the administrator to run
+the geo-replication slave in it. Enhancement in this regard will be
+available in a follow-up minor release. 
+ + [ Geo-replication over LAN ]: images/Geo-Rep_LAN.png + [ Geo-replication over WAN ]: images/Geo-Rep_WAN.png + [ Geo-replication over Internet ]: images/Geo-Rep03_Internet.png + [ Multi-site cascading Geo-replication ]: images/Geo-Rep04_Cascading.png + []: http://docs.redhat.com/docs/en-US/Red_Hat_Enterprise_Linux/6/html/Migration_Planning_Guide/ch04s07.html + [1]: www.redhat.com/support/ diff --git a/doc/admin-guide/en-US/markdown/admin_managing_snapshots.md b/doc/admin-guide/en-US/markdown/admin_managing_snapshots.md new file mode 100644 index 000000000..e76ee9151 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_managing_snapshots.md @@ -0,0 +1,66 @@ +Managing GlusterFS Volume Snapshots +========================== + +This section describes how to perform common GlusterFS volume snapshot +management operations + +Pre-requisites +===================== + +GlusterFS volume snapshot feature is based on thinly provisioned LVM snapshot. +To make use of snapshot feature GlusterFS volume should fulfill following +pre-requisites: + +* Each brick should be on an independent thinly provisioned LVM. +* Brick LVM should not contain any other data other than brick. +* None of the brick should be on a thick LVM. + + +Snapshot Management +===================== + + +**Snapshot creation** + +*gluster snapshot create \<vol-name\> \[-n \<snap-name\>\] \[-d \<description\>\]* + +This command will create a snapshot of a GlusterFS volume. User can provide a snap-name and a description to identify the snap. The description cannot be more than 1024 characters. + +Volume should be present and it should be in started state. + +**Restoring snaps** + +*gluster snapshot restore -v \<vol-name\> \<snap-name\>* + +This command restores an already taken snapshot of a GlusterFS volume. Snapshot restore is an offline activity therefore if the volume is online then the restore operation will fail. + +Once the snapshot is restored it will be deleted from the list of snapshot. 
+ +**Deleting snaps** + +*gluster snapshot delete \<volname\>\ -s \<snap-name\> \[force\]* + +This command will delete the specified snapshot. + +**Listing of available snaps** + +*gluster snapshot list \[\<volname\> \[-s \<snap-name>\]\]* + +This command is used to list all snapshots taken, or for a specified volume. If snap-name is provided then it will list the details of that snap. + +**Configuring the snapshot behavior** + +*gluster snapshot config \[\<vol-name | all\>\]* + +This command will display existing config values for a volume. If volume name is not provided then config values of all the volume is displayed. System config is displayed irrespective of volume name. + +*gluster snapshot config \<vol-name | all\> \[\<snap-max-hard-limit\> \<count\>\] \[\<snap-max-soft-limit\> \<percentage\>\]* + +The above command can be used to change the existing config values. If vol-name is provided then config value of that volume is changed, else it will set/change the system limit. + +The system limit is the default value of the config for all the volume. Volume specific limit cannot cross the system limit. If a volume specific limit is not provided then system limit will be considered. + +If any of this limit is decreased and the current snap count of the system/volume is more than the limit then the command will fail. If user still want to decrease the limit then force option should be used. + + + diff --git a/doc/admin-guide/en-US/markdown/admin_managing_volumes.md b/doc/admin-guide/en-US/markdown/admin_managing_volumes.md new file mode 100644 index 000000000..6c06e27a0 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_managing_volumes.md @@ -0,0 +1,710 @@ +Managing GlusterFS Volumes +========================== + +This section describes how to perform common GlusterFS management +operations, including the following: + +- ? + +- ? + +- ? + +- ? + +- ? + +- ? + +- ? + +- ? 
+ +Tuning Volume Options +===================== + +You can tune volume options, as needed, while the cluster is online and +available. + +> **Note** +> +> Red Hat recommends you to set server.allow-insecure option to ON if +> there are too many bricks in each volume or if there are too many +> services which have already utilized all the privileged ports in the +> system. Turning this option ON allows ports to accept/reject messages +> from insecure ports. So, use this option only if your deployment +> requires it. + +To tune volume options + +- Tune volume options using the following command: + + `# gluster volume set ` + + For example, to specify the performance cache size for test-volume: + + # gluster volume set test-volume performance.cache-size 256MB + Set volume successful + + The following table lists the Volume options along with its + description and default value: + + > **Note** + > + > The default options given here are subject to modification at any + > given time and may not be the same for all versions. 
+ + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Option Description Default Value Available Options + -------------------------------------- ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ---------------------------------- --------------------------------------------------------------------------------------- + auth.allow IP addresses of the clients which should be allowed to access the volume. \* (allow all) Valid IP address which includes wild card patterns including \*, such as 192.168.1.\* + + auth.reject IP addresses of the clients which should be denied to access the volume. 
NONE (reject none) Valid IP address which includes wild card patterns including \*, such as 192.168.2.\* + + client.grace-timeout Specifies the duration for the lock state to be maintained on the client after a network disconnection. 10 10 - 1800 secs + + cluster.self-heal-window-size Specifies the maximum number of blocks per file on which self-heal would happen simultaneously. 16 0 - 1025 blocks + + cluster.data-self-heal-algorithm Specifies the type of self-heal. If you set the option as "full", the entire file is copied from source to destinations. If the option is set to "diff" the file blocks that are not in sync are copied to destinations. Reset uses a heuristic model. If the file does not exist on one of the subvolumes, or a zero-byte file exists (created by entry self-heal) the entire content has to be copied anyway, so there is no benefit from using the "diff" algorithm. If the file size is about the same as page size, the entire file can be read and written with a few operations, which will be faster than "diff" which has to read checksums and then read and write. reset full | diff | reset + + cluster.min-free-disk Specifies the percentage of disk space that must be kept free. Might be useful for non-uniform bricks. 10% Percentage of required minimum free disk space + + cluster.stripe-block-size Specifies the size of the stripe unit that will be read from or written to. 128 KB (for all files) size in bytes + + cluster.self-heal-daemon Allows you to turn-off proactive self-heal on replicated volumes. on On | Off + + cluster.ensure-durability This option makes sure the data/metadata is durable across abrupt shutdown of the brick. on On | Off + + diagnostics.brick-log-level Changes the log-level of the bricks. INFO DEBUG|WARNING|ERROR|CRITICAL|NONE|TRACE + + diagnostics.client-log-level Changes the log-level of the clients. 
INFO DEBUG|WARNING|ERROR|CRITICAL|NONE|TRACE + + diagnostics.latency-measurement Statistics related to the latency of each operation would be tracked. off On | Off + + diagnostics.dump-fd-stats Statistics related to file-operations would be tracked. off On | Off + + feature.read-only Enables you to mount the entire volume as read-only for all the clients (including NFS clients) accessing it. off On | Off + + features.lock-heal Enables self-healing of locks when the network disconnects. on On | Off + + features.quota-timeout For performance reasons, quota caches the directory sizes on client. You can set timeout indicating the maximum duration of directory sizes in cache, from the time they are populated, during which they are considered valid. 0 0 - 3600 secs + + geo-replication.indexing Use this option to automatically sync the changes in the filesystem from Master to Slave. off On | Off + + network.frame-timeout The time frame after which the operation has to be declared as dead, if the server does not respond for a particular operation. 1800 (30 mins) 1800 secs + + network.ping-timeout The time duration for which the client waits to check if the server is responsive. When a ping timeout happens, there is a network disconnect between the client and server. All resources held by server on behalf of the client get cleaned up. When a reconnection happens, all resources will need to be re-acquired before the client can resume its operations on the server. Additionally, the locks will be acquired and the lock tables updated. 42 Secs 42 Secs + This reconnect is a very expensive operation and should be avoided. + + nfs.enable-ino32 For 32-bit nfs clients or applications that do not support 64-bit inode numbers or large files, use this option from the CLI to make Gluster NFS return 32-bit inode numbers instead of 64-bit inode numbers. Applications that will benefit are those that were either: off On | Off + \* Built 32-bit and run on 32-bit machines. 
+ + \* Built 32-bit on 64-bit systems. + + \* Built 64-bit but use a library built 32-bit, especially relevant for python and perl scripts. + + Either of the conditions above can lead to application on Linux NFS clients failing with "Invalid argument" or "Value too large for defined data type" errors. + + nfs.volume-access Set the access type for the specified sub-volume. read-write read-write|read-only + + nfs.trusted-write If there is an UNSTABLE write from the client, STABLE flag will be returned to force the client to not send a COMMIT request. off On | Off + In some environments, combined with a replicated GlusterFS setup, this option can improve write performance. This flag allows users to trust Gluster replication logic to sync data to the disks and recover when required. COMMIT requests if received will be handled in a default manner by fsyncing. STABLE writes are still handled in a sync manner. + + nfs.trusted-sync All writes and COMMIT requests are treated as async. This implies that no write requests are guaranteed to be on server disks when the write reply is received at the NFS client. Trusted sync includes trusted-write behavior. off On | Off + + nfs.export-dir This option can be used to export specified comma separated subdirectories in the volume. The path must be an absolute path. Along with path allowed list of IPs/hostname can be associated with each subdirectory. If provided connection will allowed only from these IPs. Format: \<dir\>[(hostspec[|hostspec|...])][,...]. Where hostspec can be an IP address, hostname or an IP range in CIDR notation. **Note**: Care must be taken while configuring this option as invalid entries and/or unreachable DNS servers can introduce unwanted delay in all the mount calls. No sub directory exported. Absolute path with allowed list of IP/hostname. + + nfs.export-volumes Enable/Disable exporting entire volumes, instead if used in conjunction with nfs3.export-dir, can allow setting up only subdirectories as exports. 
on On | Off + + nfs.rpc-auth-unix Enable/Disable the AUTH\_UNIX authentication type. This option is enabled by default for better interoperability. However, you can disable it if required. on On | Off + + nfs.rpc-auth-null Enable/Disable the AUTH\_NULL authentication type. It is not recommended to change the default value for this option. on On | Off + + nfs.rpc-auth-allow\<IP- Addresses\> Allow a comma separated list of addresses and/or hostnames to connect to the server. By default, all clients are disallowed. This allows you to define a general rule for all exported volumes. Reject All IP address or Host name + + nfs.rpc-auth-reject IP- Addresses Reject a comma separated list of addresses and/or hostnames from connecting to the server. By default, all connections are disallowed. This allows you to define a general rule for all exported volumes. Reject All IP address or Host name + + nfs.ports-insecure Allow client connections from unprivileged ports. By default only privileged ports are allowed. This is a global setting in case insecure ports are to be enabled for all exports using a single option. off On | Off + + nfs.addr-namelookup Turn-off name lookup for incoming client connections using this option. In some setups, the name server can take too long to reply to DNS queries resulting in timeouts of mount requests. Use this option to turn off name lookups during address authentication. Note, turning this off will prevent you from using hostnames in rpc-auth.addr.\* filters. on On | Off + + nfs.register-with- portmap For systems that need to run multiple NFS servers, you need to prevent more than one from registering with portmap service. Use this option to turn off portmap registration for Gluster NFS. on On | Off + + nfs.port \<PORT- NUMBER\> Use this option on systems that need Gluster NFS to be associated with a non-default port number. 
38465- 38467 + + nfs.disable Turn-off volume being exported by NFS off On | Off + + performance.write-behind-window-size Size of the per-file write-behind buffer. 1 MB Write-behind cache size + + performance.io-thread-count The number of threads in IO threads translator. 16 0 - 65 + + performance.flush-behind If this option is set ON, instructs write-behind translator to perform flush in background, by returning success (or any errors, if any of previous writes were failed) to application even before flush is sent to backend filesystem. On On | Off + + performance.cache-max-file-size Sets the maximum file size cached by the io-cache translator. Can use the normal size descriptors of KB, MB, GB,TB or PB (for example, 6GB). Maximum size uint64. 2 \^ 64 -1 bytes size in bytes + + performance.cache-min-file-size Sets the minimum file size cached by the io-cache translator. Values same as "max" above. 0B size in bytes + + performance.cache-refresh-timeout The cached data for a file will be retained till 'cache-refresh-timeout' seconds, after which data re-validation is performed. 1 sec 0 - 61 + + performance.cache-size Size of the read cache. 32 MB size in bytes + + server.allow-insecure Allow client connections from unprivileged ports. By default only privileged ports are allowed. This is a global setting in case insecure ports are to be enabled for all exports using a single option. on On | Off + + server.grace-timeout Specifies the duration for the lock state to be maintained on the server after a network disconnection. 10 10 - 1800 secs + + server.statedump-path Location of the state dump file. /tmp directory of the brick New directory path + + storage.health-check-interval Number of seconds between health-checks done on the filesystem that is used for the brick(s). Defaults to 30 seconds, set to 0 to disable. 
/tmp directory of the brick New directory path + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + You can view the changed volume options using + the` # gluster volume info ` command. For more information, see ?. + +Expanding Volumes +================= + +You can expand volumes, as needed, while the cluster is online and +available. For example, you might want to add a brick to a distributed +volume, thereby increasing the distribution and adding to the capacity +of the GlusterFS volume. + +Similarly, you might want to add a group of bricks to a distributed +replicated volume, increasing the capacity of the GlusterFS volume. + +> **Note** +> +> When expanding distributed replicated and distributed striped volumes, +> you need to add a number of bricks that is a multiple of the replica +> or stripe count. For example, to expand a distributed replicated +> volume with a replica count of 2, you need to add bricks in multiples +> of 2 (such as 4, 6, 8, etc.). + +**To expand a volume** + +1. 
On the first server in the cluster, probe the server to which you + want to add the new brick using the following command: + + `# gluster peer probe ` + + For example: + + # gluster peer probe server4 + Probe successful + +2. Add the brick using the following command: + + `# gluster volume add-brick ` + + For example: + + # gluster volume add-brick test-volume server4:/exp4 + Add Brick successful + +3. Check the volume information using the following command: + + `# gluster volume info ` + + The command displays information similar to the following: + + Volume Name: test-volume + Type: Distribute + Status: Started + Number of Bricks: 4 + Bricks: + Brick1: server1:/exp1 + Brick2: server2:/exp2 + Brick3: server3:/exp3 + Brick4: server4:/exp4 + +4. Rebalance the volume to ensure that all files are distributed to the + new brick. + + You can use the rebalance command as described in ?. + +Shrinking Volumes +================= + +You can shrink volumes, as needed, while the cluster is online and +available. For example, you might need to remove a brick that has become +inaccessible in a distributed volume due to hardware or network failure. + +> **Note** +> +> Data residing on the brick that you are removing will no longer be +> accessible at the Gluster mount point. Note however that only the +> configuration information is removed - you can continue to access the +> data directly from the brick, as necessary. + +When shrinking distributed replicated and distributed striped volumes, +you need to remove a number of bricks that is a multiple of the replica +or stripe count. For example, to shrink a distributed striped volume +with a stripe count of 2, you need to remove bricks in multiples of 2 +(such as 4, 6, 8, etc.). In addition, the bricks you are trying to +remove must be from the same sub-volume (the same replica or stripe +set). + +**To shrink a volume** + +1. 
Remove the brick using the following command: + + `# gluster volume remove-brick ` `start` + + For example, to remove server2:/exp2: + + # gluster volume remove-brick test-volume server2:/exp2 + + Removing brick(s) can result in data loss. Do you want to Continue? (y/n) + +2. Enter "y" to confirm the operation. The command displays the + following message indicating that the remove brick operation is + successfully started: + + Remove Brick successful + +3. (Optional) View the status of the remove brick operation using the + following command: + + `# gluster volume remove-brick `` status` + + For example, to view the status of remove brick operation on + server2:/exp2 brick: + + # gluster volume remove-brick test-volume server2:/exp2 status + Node Rebalanced-files size scanned status + --------- ---------------- ---- ------- ----------- + 617c923e-6450-4065-8e33-865e28d9428f 34 340 162 in progress + +4. Check the volume information using the following command: + + `# gluster volume info ` + + The command displays information similar to the following: + + # gluster volume info + Volume Name: test-volume + Type: Distribute + Status: Started + Number of Bricks: 3 + Bricks: + Brick1: server1:/exp1 + Brick3: server3:/exp3 + Brick4: server4:/exp4 + +5. Rebalance the volume to ensure that all files are distributed to the + new brick. + + You can use the rebalance command as described in ?. + +Migrating Volumes +================= + +You can migrate the data from one brick to another, as needed, while the +cluster is online and available. + +**To migrate a volume** + +1. Make sure the new brick, server5 in this example, is successfully + added to the cluster. + + For more information, see ?. + +2. 
Migrate the data from one brick to another using the following + command: + + ` # gluster volume replace-brick start` + + For example, to migrate the data in server3:/exp3 to server5:/exp5 + in test-volume: + + # gluster volume replace-brick test-volume server3:/exp3 server5:exp5 start + Replace brick start operation successful + + > **Note** + > + > You need to have the FUSE package installed on the server on which + > you are running the replace-brick command for the command to work. + +3. To pause the migration operation, if needed, use the following + command: + + `# gluster volume replace-brick pause ` + + For example, to pause the data migration from server3:/exp3 to + server5:/exp5 in test-volume: + + # gluster volume replace-brick test-volume server3:/exp3 server5:exp5 pause + Replace brick pause operation successful + +4. To abort the migration operation, if needed, use the following + command: + + ` # gluster volume replace-brick abort ` + + For example, to abort the data migration from server3:/exp3 to + server5:/exp5 in test-volume: + + # gluster volume replace-brick test-volume server3:/exp3 server5:exp5 abort + Replace brick abort operation successful + +5. Check the status of the migration operation using the following + command: + + ` # gluster volume replace-brick status ` + + For example, to check the data migration status from server3:/exp3 + to server5:/exp5 in test-volume: + + # gluster volume replace-brick test-volume server3:/exp3 server5:/exp5 status + Current File = /usr/src/linux-headers-2.6.31-14/block/Makefile + Number of files migrated = 10567 + Migration complete + + The status command shows the current file being migrated along with + the current total number of files migrated. After completion of + migration, it displays Migration complete. + +6. 
Commit the migration of data from one brick to another using the
+ following command:
+
+ ` # gluster volume replace-brick commit `
+
+ For example, to commit the data migration from server3:/exp3 to
+ server5:/exp5 in test-volume:
+
+ # gluster volume replace-brick test-volume server3:/exp3 server5:/exp5 commit
+ replace-brick commit successful
+
+7. Verify the migration of brick by viewing the volume info using the
+ following command:
+
+ `# gluster volume info `
+
+ For example, to check the volume information of new brick
+ server5:/exp5 in test-volume:
+
+ # gluster volume info test-volume
+ Volume Name: testvolume
+ Type: Replicate
+ Status: Started
+ Number of Bricks: 4
+ Transport-type: tcp
+ Bricks:
+ Brick1: server1:/exp1
+ Brick2: server2:/exp2
+ Brick3: server4:/exp4
+ Brick4: server5:/exp5
+
+ The new volume details are displayed.
+
+ In the above example, previously, there were bricks 1, 2, 3, and 4,
+ and now brick 3 is replaced by brick 5.
+
+Rebalancing Volumes
+===================
+
+After expanding or shrinking a volume (using the add-brick and
+remove-brick commands respectively), you need to rebalance the data
+among the servers. New directories created after expanding or shrinking
+of the volume will be evenly distributed automatically. For all the
+existing directories, the distribution can be fixed by rebalancing the
+layout and/or data.
+
+This section describes how to rebalance GlusterFS volumes in your
+storage environment, using the following common scenarios:
+
+- Fix Layout - Fixes the layout changes so that the files can actually
+ go to newly added nodes. For more information, see ?.
+
+- Fix Layout and Migrate Data - Rebalances the volume by fixing the layout
+ changes and migrating the existing data. For more information, see
+ ?. 
+ +Rebalancing Volume to Fix Layout Changes +---------------------------------------- + +Fixing the layout is necessary because the layout structure is static +for a given directory. In a scenario where new bricks have been added to +the existing volume, newly created files in existing directories will +still be distributed only among the old bricks. The +`# gluster volume rebalance fix-layout start `command will fix the +layout information so that the files can also go to newly added nodes. +When this command is issued, all the file stat information which is +already cached will get revalidated. + +A fix-layout rebalance will only fix the layout changes and does not +migrate data. If you want to migrate the existing data, +use`# gluster volume rebalance start ` command to rebalance data among +the servers. + +**To rebalance a volume to fix layout changes** + +- Start the rebalance operation on any one of the server using the + following command: + + `# gluster volume rebalance fix-layout start` + + For example: + + # gluster volume rebalance test-volume fix-layout start + Starting rebalance on volume test-volume has been successful + +Rebalancing Volume to Fix Layout and Migrate Data +------------------------------------------------- + +After expanding or shrinking a volume (using the add-brick and +remove-brick commands respectively), you need to rebalance the data +among the servers. 
+ +**To rebalance a volume to fix layout and migrate the existing data** + +- Start the rebalance operation on any one of the server using the + following command: + + `# gluster volume rebalance start` + + For example: + + # gluster volume rebalance test-volume start + Starting rebalancing on volume test-volume has been successful + +- Start the migration operation forcefully on any one of the server + using the following command: + + `# gluster volume rebalance start force` + + For example: + + # gluster volume rebalance test-volume start force + Starting rebalancing on volume test-volume has been successful + +Displaying Status of Rebalance Operation +---------------------------------------- + +You can display the status information about rebalance volume operation, +as needed. + +**To view status of rebalance volume** + +- Check the status of the rebalance operation, using the following + command: + + `# gluster volume rebalance status` + + For example: + + # gluster volume rebalance test-volume status + Node Rebalanced-files size scanned status + --------- ---------------- ---- ------- ----------- + 617c923e-6450-4065-8e33-865e28d9428f 416 1463 312 in progress + + The time to complete the rebalance operation depends on the number + of files on the volume along with the corresponding file sizes. + Continue checking the rebalance status, verifying that the number of + files rebalanced or total files scanned keeps increasing. 
+ + For example, running the status command again might display a result + similar to the following: + + # gluster volume rebalance test-volume status + Node Rebalanced-files size scanned status + --------- ---------------- ---- ------- ----------- + 617c923e-6450-4065-8e33-865e28d9428f 498 1783 378 in progress + + The rebalance status displays the following when the rebalance is + complete: + + # gluster volume rebalance test-volume status + Node Rebalanced-files size scanned status + --------- ---------------- ---- ------- ----------- + 617c923e-6450-4065-8e33-865e28d9428f 502 1873 334 completed + +Stopping Rebalance Operation +---------------------------- + +You can stop the rebalance operation, as needed. + +**To stop rebalance** + +- Stop the rebalance operation using the following command: + + `# gluster volume rebalance stop` + + For example: + + # gluster volume rebalance test-volume stop + Node Rebalanced-files size scanned status + --------- ---------------- ---- ------- ----------- + 617c923e-6450-4065-8e33-865e28d9428f 59 590 244 stopped + Stopped rebalance process on volume test-volume + +Stopping Volumes +================ + +To stop a volume + +1. Stop the volume using the following command: + + `# gluster volume stop ` + + For example, to stop test-volume: + + # gluster volume stop test-volume + Stopping volume will make its data inaccessible. Do you want to continue? (y/n) + +2. Enter `y` to confirm the operation. The output of the command + displays the following: + + Stopping volume test-volume has been successful + +Deleting Volumes +================ + +To delete a volume + +1. Delete the volume using the following command: + + `# gluster volume delete ` + + For example, to delete test-volume: + + # gluster volume delete test-volume + Deleting volume will erase all information about the volume. Do you want to continue? (y/n) + +2. Enter `y` to confirm the operation. 
The command displays the + following: + + Deleting volume test-volume has been successful + +Triggering Self-Heal on Replicate +================================= + +In replicate module, previously you had to manually trigger a self-heal +when a brick goes offline and comes back online, to bring all the +replicas in sync. Now the pro-active self-heal daemon runs in the +background, diagnoses issues and automatically initiates self-healing +every 10 minutes on the files which require *healing*. + +You can view the list of files that need *healing*, the list of files +which are currently/previously *healed*, list of files which are in +split-brain state, and you can manually trigger self-heal on the entire +volume or only on the files which need *healing*. + +- Trigger self-heal only on the files which require *healing*: + + `# gluster volume heal ` + + For example, to trigger self-heal on files which require *healing* + of test-volume: + + # gluster volume heal test-volume + Heal operation on volume test-volume has been successful + +- Trigger self-heal on all the files of a volume: + + `# gluster volume heal ` `full` + + For example, to trigger self-heal on all the files of + test-volume: + + # gluster volume heal test-volume full + Heal operation on volume test-volume has been successful + +- View the list of files that need *healing*: + + `# gluster volume heal ` `info` + + For example, to view the list of files on test-volume that need + *healing*: + + # gluster volume heal test-volume info + Brick :/gfs/test-volume_0 + Number of entries: 0 + + Brick :/gfs/test-volume_1 + Number of entries: 101 + /95.txt + /32.txt + /66.txt + /35.txt + /18.txt + /26.txt + /47.txt + /55.txt + /85.txt + ... 
+ +- View the list of files that are self-healed: + + `# gluster volume heal ` `info healed` + + For example, to view the list of files on test-volume that are + self-healed: + + # gluster volume heal test-volume info healed + Brick :/gfs/test-volume_0 + Number of entries: 0 + + Brick :/gfs/test-volume_1 + Number of entries: 69 + /99.txt + /93.txt + /76.txt + /11.txt + /27.txt + /64.txt + /80.txt + /19.txt + /41.txt + /29.txt + /37.txt + /46.txt + ... + +- View the list of files of a particular volume on which the self-heal + failed: + + `# gluster volume heal ` `info failed` + + For example, to view the list of files of test-volume that are not + self-healed: + + # gluster volume heal test-volume info failed + Brick :/gfs/test-volume_0 + Number of entries: 0 + + Brick server2:/gfs/test-volume_3 + Number of entries: 72 + /90.txt + /95.txt + /77.txt + /71.txt + /87.txt + /24.txt + ... + +- View the list of files of a particular volume which are in + split-brain state: + + `# gluster volume heal ` `info split-brain` + + For example, to view the list of files of test-volume which are in + split-brain state: + + # gluster volume heal test-volume info split-brain + Brick server1:/gfs/test-volume_2 + Number of entries: 12 + /83.txt + /28.txt + /69.txt + ... + + Brick :/gfs/test-volume_2 + Number of entries: 12 + /83.txt + /28.txt + /69.txt + ... + + diff --git a/doc/admin-guide/en-US/markdown/admin_monitoring_workload.md b/doc/admin-guide/en-US/markdown/admin_monitoring_workload.md new file mode 100644 index 000000000..0312bd048 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_monitoring_workload.md @@ -0,0 +1,931 @@ +Monitoring your GlusterFS Workload +================================== + +You can monitor the GlusterFS volumes on different parameters. +Monitoring volumes helps in capacity planning and performance tuning +tasks of the GlusterFS volume. Using these information, you can identify +and troubleshoot issues. 
+ +You can use Volume Top and Profile commands to view the performance and +identify bottlenecks/hotspots of each brick of a volume. This helps +system administrators to get vital performance information whenever +performance needs to be probed. + +You can also perform statedump of the brick processes and nfs server +process of a volume, and also view volume status and volume information. + +Running GlusterFS Volume Profile Command +======================================== + +GlusterFS Volume Profile command provides an interface to get the +per-brick I/O information for each File Operation (FOP) of a volume. The +per brick information helps in identifying bottlenecks in the storage +system. + +This section describes how to run GlusterFS Volume Profile command by +performing the following operations: + +- ? + +- ? + +- ? + +Start Profiling +--------------- + +You must start the Profiling to view the File Operation information for +each brick. + +**To start profiling:** + +- Start profiling using the following command: + +`# gluster volume profile start ` + +For example, to start profiling on test-volume: + + # gluster volume profile test-volume start + Profiling started on test-volume + +When profiling on the volume is started, the following additional +options are displayed in the Volume Info: + + diagnostics.count-fop-hits: on + + diagnostics.latency-measurement: on + +Displaying the I/0 Information +------------------------------ + +You can view the I/O information of each brick. 
+ +To display I/O information: + +- Display the I/O information using the following command: + +`# gluster volume profile info` + +For example, to see the I/O information on test-volume: + + # gluster volume profile test-volume info + Brick: Test:/export/2 + Cumulative Stats: + + Block 1b+ 32b+ 64b+ + Size: + Read: 0 0 0 + Write: 908 28 8 + + Block 128b+ 256b+ 512b+ + Size: + Read: 0 6 4 + Write: 5 23 16 + + Block 1024b+ 2048b+ 4096b+ + Size: + Read: 0 52 17 + Write: 15 120 846 + + Block 8192b+ 16384b+ 32768b+ + Size: + Read: 52 8 34 + Write: 234 134 286 + + Block 65536b+ 131072b+ + Size: + Read: 118 622 + Write: 1341 594 + + + %-latency Avg- Min- Max- calls Fop + latency Latency Latency + ___________________________________________________________ + 4.82 1132.28 21.00 800970.00 4575 WRITE + 5.70 156.47 9.00 665085.00 39163 READDIRP + 11.35 315.02 9.00 1433947.00 38698 LOOKUP + 11.88 1729.34 21.00 2569638.00 7382 FXATTROP + 47.35 104235.02 2485.00 7789367.00 488 FSYNC + + ------------------ + + ------------------ + + Duration : 335 + + BytesRead : 94505058 + + BytesWritten : 195571980 + +Stop Profiling +-------------- + +You can stop profiling the volume, if you do not need profiling +information anymore. + +**To stop profiling** + +- Stop profiling using the following command: + + `# gluster volume profile stop` + + For example, to stop profiling on test-volume: + + `# gluster volume profile stop` + + `Profiling stopped on test-volume` + +Running GlusterFS Volume TOP Command +==================================== + +GlusterFS Volume Top command allows you to view the glusterfs bricks’ +performance metrics like read, write, file open calls, file read calls, +file write calls, directory open calls, and directory real calls. The +top command displays up to 100 results. + +This section describes how to run and view the results for the following +GlusterFS Top commands: + +- ? + +- ? + +- ? + +- ? + +- ? + +- ? + +- ? 
+ +Viewing Open fd Count and Maximum fd Count +------------------------------------------ + +You can view both current open fd count (list of files that are +currently the most opened and the count) on the brick and the maximum +open fd count (count of files that are the currently open and the count +of maximum number of files opened at any given point of time, since the +servers are up and running). If the brick name is not specified, then +open fd metrics of all the bricks belonging to the volume will be +displayed. + +**To view open fd count and maximum fd count:** + +- View open fd count and maximum fd count using the following command: + + `# gluster volume top open [brick ] [list-cnt ]` + + For example, to view open fd count and maximum fd count on brick + server:/export of test-volume and list top 10 open calls: + + `# gluster volume top open brick list-cnt ` + + `Brick: server:/export/dir1 ` + + `Current open fd's: 34 Max open fd's: 209 ` + + ==========Open file stats======== + + open file name + call count + + 2 /clients/client0/~dmtmp/PARADOX/ + COURSES.DB + + 11 /clients/client0/~dmtmp/PARADOX/ + ENROLL.DB + + 11 /clients/client0/~dmtmp/PARADOX/ + STUDENTS.DB + + 10 /clients/client0/~dmtmp/PWRPNT/ + TIPS.PPT + + 10 /clients/client0/~dmtmp/PWRPNT/ + PCBENCHM.PPT + + 9 /clients/client7/~dmtmp/PARADOX/ + STUDENTS.DB + + 9 /clients/client1/~dmtmp/PARADOX/ + STUDENTS.DB + + 9 /clients/client2/~dmtmp/PARADOX/ + STUDENTS.DB + + 9 /clients/client0/~dmtmp/PARADOX/ + STUDENTS.DB + + 9 /clients/client8/~dmtmp/PARADOX/ + STUDENTS.DB + +Viewing Highest File Read Calls +------------------------------- + +You can view highest read calls on each brick. If brick name is not +specified, then by default, list of 100 files will be displayed. 
+ +**To view highest file Read calls:** + +- View highest file Read calls using the following command: + + `# gluster volume top read [brick ] [list-cnt ] ` + + For example, to view highest Read calls on brick server:/export of + test-volume: + + `# gluster volume top read brick list-cnt ` + + `Brick:` server:/export/dir1 + + ==========Read file stats======== + + read filename + call count + + 116 /clients/client0/~dmtmp/SEED/LARGE.FIL + + 64 /clients/client0/~dmtmp/SEED/MEDIUM.FIL + + 54 /clients/client2/~dmtmp/SEED/LARGE.FIL + + 54 /clients/client6/~dmtmp/SEED/LARGE.FIL + + 54 /clients/client5/~dmtmp/SEED/LARGE.FIL + + 54 /clients/client0/~dmtmp/SEED/LARGE.FIL + + 54 /clients/client3/~dmtmp/SEED/LARGE.FIL + + 54 /clients/client4/~dmtmp/SEED/LARGE.FIL + + 54 /clients/client9/~dmtmp/SEED/LARGE.FIL + + 54 /clients/client8/~dmtmp/SEED/LARGE.FIL + +Viewing Highest File Write Calls +-------------------------------- + +You can view list of files which has highest file write calls on each +brick. If brick name is not specified, then by default, list of 100 +files will be displayed. 
+ +**To view highest file Write calls:** + +- View highest file Write calls using the following command: + + `# gluster volume top write [brick ] [list-cnt ] ` + + For example, to view highest Write calls on brick server:/export of + test-volume: + + `# gluster volume top write brick list-cnt ` + + `Brick: server:/export/dir1 ` + + ==========Write file stats======== + write call count filename + + 83 /clients/client0/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client7/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client1/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client2/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client0/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client8/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client5/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client4/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client6/~dmtmp/SEED/LARGE.FIL + + 59 /clients/client3/~dmtmp/SEED/LARGE.FIL + +Viewing Highest Open Calls on Directories +----------------------------------------- + +You can view list of files which has highest open calls on directories +of each brick. If brick name is not specified, then the metrics of all +the bricks belonging to that volume will be displayed. 
+ +To view list of open calls on each directory + +- View list of open calls on each directory using the following + command: + + `# gluster volume top opendir [brick ] [list-cnt ] ` + + For example, to view open calls on brick server:/export/ of + test-volume: + + `# gluster volume top opendir brick list-cnt ` + + `Brick: server:/export/dir1 ` + + ==========Directory open stats======== + + Opendir count directory name + + 1001 /clients/client0/~dmtmp + + 454 /clients/client8/~dmtmp + + 454 /clients/client2/~dmtmp + + 454 /clients/client6/~dmtmp + + 454 /clients/client5/~dmtmp + + 454 /clients/client9/~dmtmp + + 443 /clients/client0/~dmtmp/PARADOX + + 408 /clients/client1/~dmtmp + + 408 /clients/client7/~dmtmp + + 402 /clients/client4/~dmtmp + +Viewing Highest Read Calls on Directory +--------------------------------------- + +You can view list of files which has highest directory read calls on +each brick. If brick name is not specified, then the metrics of all the +bricks belonging to that volume will be displayed. + +**To view list of highest directory read calls on each brick** + +- View list of highest directory read calls on each brick using the + following command: + + `# gluster volume top readdir [brick ] [list-cnt ] ` + + For example, to view highest directory read calls on brick + server:/export of test-volume: + + `# gluster volume top readdir brick list-cnt ` + + `Brick: ` + + ==========Directory readdirp stats======== + + readdirp count directory name + + 1996 /clients/client0/~dmtmp + + 1083 /clients/client0/~dmtmp/PARADOX + + 904 /clients/client8/~dmtmp + + 904 /clients/client2/~dmtmp + + 904 /clients/client6/~dmtmp + + 904 /clients/client5/~dmtmp + + 904 /clients/client9/~dmtmp + + 812 /clients/client1/~dmtmp + + 812 /clients/client7/~dmtmp + + 800 /clients/client4/~dmtmp + +Viewing List of Read Performance on each Brick +---------------------------------------------- + +You can view the read throughput of files on each brick. 
If brick name +is not specified, then the metrics of all the bricks belonging to that +volume will be displayed. The output will be the read throughput. + + ==========Read throughput file stats======== + + read filename Time + through + put(MBp + s) + + 2570.00 /clients/client0/~dmtmp/PWRPNT/ -2011-01-31 + TRIDOTS.POT 15:38:36.894610 + 2570.00 /clients/client0/~dmtmp/PWRPNT/ -2011-01-31 + PCBENCHM.PPT 15:38:39.815310 + 2383.00 /clients/client2/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:52:53.631499 + + 2340.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:38:36.926198 + + 2299.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + LARGE.FIL 15:38:36.930445 + + 2259.00 /clients/client0/~dmtmp/PARADOX/ -2011-01-31 + COURSES.X04 15:38:40.549919 + + 2221.00 /clients/client0/~dmtmp/PARADOX/ -2011-01-31 + STUDENTS.VAL 15:52:53.298766 + + 2221.00 /clients/client3/~dmtmp/SEED/ -2011-01-31 + COURSES.DB 15:39:11.776780 + + 2184.00 /clients/client3/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:39:10.251764 + + 2184.00 /clients/client5/~dmtmp/WORD/ -2011-01-31 + BASEMACH.DOC 15:39:09.336572 + +This command will initiate a dd for the specified count and block size +and measures the corresponding throughput. 
+ +**To view list of read performance on each brick** + +- View list of read performance on each brick using the following + command: + + `# gluster volume top read-perf [bs count ] [brick ] [list-cnt ]` + + For example, to view read performance on brick server:/export/ of + test-volume, 256 block size of count 1, and list count 10: + + `# gluster volume top read-perf bs 256 count 1 brick list-cnt ` + + `Brick: server:/export/dir1 256 bytes (256 B) copied, Throughput: 4.1 MB/s ` + + ==========Read throughput file stats======== + + read filename Time + through + put(MBp + s) + + 2912.00 /clients/client0/~dmtmp/PWRPNT/ -2011-01-31 + TRIDOTS.POT 15:38:36.896486 + + 2570.00 /clients/client0/~dmtmp/PWRPNT/ -2011-01-31 + PCBENCHM.PPT 15:38:39.815310 + + 2383.00 /clients/client2/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:52:53.631499 + + 2340.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:38:36.926198 + + 2299.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + LARGE.FIL 15:38:36.930445 + + 2259.00 /clients/client0/~dmtmp/PARADOX/ -2011-01-31 + COURSES.X04 15:38:40.549919 + + 2221.00 /clients/client9/~dmtmp/PARADOX/ -2011-01-31 + STUDENTS.VAL 15:52:53.298766 + + 2221.00 /clients/client8/~dmtmp/PARADOX/ -2011-01-31 + COURSES.DB 15:39:11.776780 + + 2184.00 /clients/client3/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:39:10.251764 + + 2184.00 /clients/client5/~dmtmp/WORD/ -2011-01-31 + BASEMACH.DOC 15:39:09.336572 + + +Viewing List of Write Performance on each Brick +----------------------------------------------- + +You can view list of write throughput of files on each brick. If brick +name is not specified, then the metrics of all the bricks belonging to +that volume will be displayed. The output will be the write throughput. + +This command will initiate a dd for the specified count and block size +and measures the corresponding throughput. 
To view list of write +performance on each brick: + +- View list of write performance on each brick using the following + command: + + `# gluster volume top write-perf [bs count ] [brick ] [list-cnt ] ` + + For example, to view write performance on brick server:/export/ of + test-volume, 256 block size of count 1, and list count 10: + + `# gluster volume top write-perf bs 256 count 1 brick list-cnt ` + + `Brick`: server:/export/dir1 + + `256 bytes (256 B) copied, Throughput: 2.8 MB/s ` + + ==========Write throughput file stats======== + + write filename Time + throughput + (MBps) + + 1170.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + SMALL.FIL 15:39:09.171494 + + 1008.00 /clients/client6/~dmtmp/SEED/ -2011-01-31 + LARGE.FIL 15:39:09.73189 + + 949.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:38:36.927426 + + 936.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + LARGE.FIL 15:38:36.933177 + 897.00 /clients/client5/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:39:09.33628 + + 897.00 /clients/client6/~dmtmp/SEED/ -2011-01-31 + MEDIUM.FIL 15:39:09.27713 + + 885.00 /clients/client0/~dmtmp/SEED/ -2011-01-31 + SMALL.FIL 15:38:36.924271 + + 528.00 /clients/client5/~dmtmp/SEED/ -2011-01-31 + LARGE.FIL 15:39:09.81893 + + 516.00 /clients/client6/~dmtmp/ACCESS/ -2011-01-31 + FASTENER.MDB 15:39:01.797317 + +Displaying Volume Information +============================= + +You can display information about a specific volume, or all volumes, as +needed. 
+ +**To display volume information** + +- Display information about a specific volume using the following + command: + + `# gluster volume info ``VOLNAME` + + For example, to display information about test-volume: + + # gluster volume info test-volume + Volume Name: test-volume + Type: Distribute + Status: Created + Number of Bricks: 4 + Bricks: + Brick1: server1:/exp1 + Brick2: server2:/exp2 + Brick3: server3:/exp3 + Brick4: server4:/exp4 + +- Display information about all volumes using the following command: + + `# gluster volume info all` + + # gluster volume info all + + Volume Name: test-volume + Type: Distribute + Status: Created + Number of Bricks: 4 + Bricks: + Brick1: server1:/exp1 + Brick2: server2:/exp2 + Brick3: server3:/exp3 + Brick4: server4:/exp4 + + Volume Name: mirror + Type: Distributed-Replicate + Status: Started + Number of Bricks: 2 X 2 = 4 + Bricks: + Brick1: server1:/brick1 + Brick2: server2:/brick2 + Brick3: server3:/brick3 + Brick4: server4:/brick4 + + Volume Name: Vol + Type: Distribute + Status: Started + Number of Bricks: 1 + Bricks: + Brick: server:/brick6 + +Performing Statedump on a Volume +================================ + +Statedump is a mechanism through which you can get details of all +internal variables and state of the glusterfs process at the time of +issuing the command.You can perform statedumps of the brick processes +and nfs server process of a volume using the statedump command. The +following options can be used to determine what information is to be +dumped: + +- **mem** - Dumps the memory usage and memory pool details of the + bricks. + +- **iobuf** - Dumps iobuf details of the bricks. + +- **priv** - Dumps private information of loaded translators. + +- **callpool** - Dumps the pending calls of the volume. + +- **fd** - Dumps the open fd tables of the volume. + +- **inode** - Dumps the inode tables of the volume. 
**To display volume statedump** + +- Display statedump of a volume or NFS server using the following + command: + + `# gluster volume statedump [nfs] [all|mem|iobuf|callpool|priv|fd|inode]` + + For example, to display statedump of test-volume: + + # gluster volume statedump test-volume + Volume statedump successful + + The statedump files are created on the brick servers in the `/tmp` + directory or in the directory set using `server.statedump-path` + volume option. The naming convention of the dump file is + `<brick-path>.<brick-pid>.dump`. + +- By default, the output of the statedump is stored at + ` /tmp/<brickname.PID.dump>` file on that particular server. Change + the directory of the statedump file using the following command: + + `# gluster volume set server.statedump-path ` + + For example, to change the location of the statedump file of + test-volume: + + # gluster volume set test-volume server.statedump-path /usr/local/var/log/glusterfs/dumps/ + Set volume successful + + You can view the changed path of the statedump file using the + following command: + + `# gluster volume info ` + +Displaying Volume Status +======================== + +You can display the status information about a specific volume, brick or +all volumes, as needed. Status information can be used to understand the +current status of the brick, nfs processes, and overall file system. +Status information can also be used to monitor and debug the volume +information. You can view status of the volume along with the following +details: + +- **detail** - Displays additional information about the bricks. + +- **clients** - Displays the list of clients connected to the volume. + +- **mem** - Displays the memory usage and memory pool details of the + bricks. + +- **inode** - Displays the inode tables of the volume. + +- **fd** - Displays the open fd (file descriptors) tables of the + volume. + +- **callpool** - Displays the pending calls of the volume. 
+ +**To display volume status** + +- Display information about a specific volume using the following + command: + + `# gluster volume status [all| []] [detail|clients|mem|inode|fd|callpool]` + + For example, to display information about test-volume: + + # gluster volume status test-volume + STATUS OF VOLUME: test-volume + BRICK PORT ONLINE PID + -------------------------------------------------------- + arch:/export/1 24009 Y 22445 + -------------------------------------------------------- + arch:/export/2 24010 Y 22450 + +- Display information about all volumes using the following command: + + `# gluster volume status all` + + # gluster volume status all + STATUS OF VOLUME: volume-test + BRICK PORT ONLINE PID + -------------------------------------------------------- + arch:/export/4 24010 Y 22455 + + STATUS OF VOLUME: test-volume + BRICK PORT ONLINE PID + -------------------------------------------------------- + arch:/export/1 24009 Y 22445 + -------------------------------------------------------- + arch:/export/2 24010 Y 22450 + +- Display additional information about the bricks using the following + command: + + `# gluster volume status detail` + + For example, to display additional information about the bricks of + test-volume: + + # gluster volume status test-volume details + STATUS OF VOLUME: test-volume + ------------------------------------------- + Brick : arch:/export/1 + Port : 24009 + Online : Y + Pid : 16977 + File System : rootfs + Device : rootfs + Mount Options : rw + Disk Space Free : 13.8GB + Total Disk Space : 46.5GB + Inode Size : N/A + Inode Count : N/A + Free Inodes : N/A + + Number of Bricks: 1 + Bricks: + Brick: server:/brick6 + +- Display the list of clients accessing the volumes using the + following command: + + `# gluster volume status clients` + + For example, to display the list of clients connected to + test-volume: + + # gluster volume status test-volume clients + Brick : arch:/export/1 + Clients connected : 2 + Hostname Bytes 
Read BytesWritten + -------- --------- ------------ + 127.0.0.1:1013 776 676 + 127.0.0.1:1012 50440 51200 + +- Display the memory usage and memory pool details of the bricks using + the following command: + + `# gluster volume status mem` + + For example, to display the memory usage and memory pool details of + the bricks of test-volume: + + Memory status for volume : test-volume + ---------------------------------------------- + Brick : arch:/export/1 + Mallinfo + -------- + Arena : 434176 + Ordblks : 2 + Smblks : 0 + Hblks : 12 + Hblkhd : 40861696 + Usmblks : 0 + Fsmblks : 0 + Uordblks : 332416 + Fordblks : 101760 + Keepcost : 100400 + + Mempool Stats + ------------- + Name HotCount ColdCount PaddedSizeof AllocCount MaxAlloc + ---- -------- --------- ------------ ---------- -------- + test-volume-server:fd_t 0 16384 92 57 5 + test-volume-server:dentry_t 59 965 84 59 59 + test-volume-server:inode_t 60 964 148 60 60 + test-volume-server:rpcsvc_request_t 0 525 6372 351 2 + glusterfs:struct saved_frame 0 4096 124 2 2 + glusterfs:struct rpc_req 0 4096 2236 2 2 + glusterfs:rpcsvc_request_t 1 524 6372 2 1 + glusterfs:call_stub_t 0 1024 1220 288 1 + glusterfs:call_stack_t 0 8192 2084 290 2 + glusterfs:call_frame_t 0 16384 172 1728 6 + +- Display the inode tables of the volume using the following command: + + `# gluster volume status inode` + + For example, to display the inode tables of the test-volume: + + # gluster volume status test-volume inode + inode tables for volume test-volume + ---------------------------------------------- + Brick : arch:/export/1 + Active inodes: + GFID Lookups Ref IA type + ---- ------- --- ------- + 6f3fe173-e07a-4209-abb6-484091d75499 1 9 2 + 370d35d7-657e-44dc-bac4-d6dd800ec3d3 1 1 2 + + LRU inodes: + GFID Lookups Ref IA type + ---- ------- --- ------- + 80f98abe-cdcf-4c1d-b917-ae564cf55763 1 0 1 + 3a58973d-d549-4ea6-9977-9aa218f233de 1 0 1 + 2ce0197d-87a9-451b-9094-9baa38121155 1 0 2 + +- Display the open fd tables of the volume using 
the following + command: + + `# gluster volume status fd` + + For example, to display the open fd tables of the test-volume: + + # gluster volume status test-volume fd + + FD tables for volume test-volume + ---------------------------------------------- + Brick : arch:/export/1 + Connection 1: + RefCount = 0 MaxFDs = 128 FirstFree = 4 + FD Entry PID RefCount Flags + -------- --- -------- ----- + 0 26311 1 2 + 1 26310 3 2 + 2 26310 1 2 + 3 26311 3 2 + + Connection 2: + RefCount = 0 MaxFDs = 128 FirstFree = 0 + No open fds + + Connection 3: + RefCount = 0 MaxFDs = 128 FirstFree = 0 + No open fds + +- Display the pending calls of the volume using the following command: + + `# gluster volume status callpool` + + Each call has a call stack containing call frames. + + For example, to display the pending calls of test-volume: + + # gluster volume status test-volume + + Pending calls for volume test-volume + ---------------------------------------------- + Brick : arch:/export/1 + Pending calls: 2 + Call Stack1 + UID : 0 + GID : 0 + PID : 26338 + Unique : 192138 + Frames : 7 + Frame 1 + Ref Count = 1 + Translator = test-volume-server + Completed = No + Frame 2 + Ref Count = 0 + Translator = test-volume-posix + Completed = No + Parent = test-volume-access-control + Wind From = default_fsync + Wind To = FIRST_CHILD(this)->fops->fsync + Frame 3 + Ref Count = 1 + Translator = test-volume-access-control + Completed = No + Parent = repl-locks + Wind From = default_fsync + Wind To = FIRST_CHILD(this)->fops->fsync + Frame 4 + Ref Count = 1 + Translator = test-volume-locks + Completed = No + Parent = test-volume-io-threads + Wind From = iot_fsync_wrapper + Wind To = FIRST_CHILD (this)->fops->fsync + Frame 5 + Ref Count = 1 + Translator = test-volume-io-threads + Completed = No + Parent = test-volume-marker + Wind From = default_fsync + Wind To = FIRST_CHILD(this)->fops->fsync + Frame 6 + Ref Count = 1 + Translator = test-volume-marker + Completed = No + Parent = /export/1 + Wind 
From = io_stats_fsync + Wind To = FIRST_CHILD(this)->fops->fsync + Frame 7 + Ref Count = 1 + Translator = /export/1 + Completed = No + Parent = test-volume-server + Wind From = server_fsync_resume + Wind To = bound_xl->fops->fsync + + diff --git a/doc/admin-guide/en-US/markdown/admin_setting_volumes.md b/doc/admin-guide/en-US/markdown/admin_setting_volumes.md new file mode 100644 index 000000000..4038523c8 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_setting_volumes.md @@ -0,0 +1,419 @@ +Setting up GlusterFS Server Volumes +=================================== + +A volume is a logical collection of bricks where each brick is an export +directory on a server in the trusted storage pool. Most of the gluster +management operations are performed on the volume. + +To create a new volume in your storage environment, specify the bricks +that comprise the volume. After you have created a new volume, you must +start it before attempting to mount it. + +- Volumes of the following types can be created in your storage + environment: + + - Distributed - Distributed volumes distributes files throughout + the bricks in the volume. You can use distributed volumes where + the requirement is to scale storage and the redundancy is either + not important or is provided by other hardware/software layers. + For more information, see ? . + + - Replicated – Replicated volumes replicates files across bricks + in the volume. You can use replicated volumes in environments + where high-availability and high-reliability are critical. For + more information, see ?. + + - Striped – Striped volumes stripes data across bricks in the + volume. For best results, you should use striped volumes only in + high concurrency environments accessing very large files. For + more information, see ?. + + - Distributed Striped - Distributed striped volumes stripe data + across two or more nodes in the cluster. 
You should use + distributed striped volumes where the requirement is to scale + storage and in high concurrency environments accessing very + large files is critical. For more information, see ?. + + - Distributed Replicated - Distributed replicated volumes + distributes files across replicated bricks in the volume. You + can use distributed replicated volumes in environments where the + requirement is to scale storage and high-reliability is + critical. Distributed replicated volumes also offer improved + read performance in most environments. For more information, see + ?. + + - Distributed Striped Replicated – Distributed striped replicated + volumes distributes striped data across replicated bricks in the + cluster. For best results, you should use distributed striped + replicated volumes in highly concurrent environments where + parallel access of very large files and performance is critical. + In this release, configuration of this volume type is supported + only for Map Reduce workloads. For more information, see ?. + + - Striped Replicated – Striped replicated volumes stripes data + across replicated bricks in the cluster. For best results, you + should use striped replicated volumes in highly concurrent + environments where there is parallel access of very large files + and performance is critical. In this release, configuration of + this volume type is supported only for Map Reduce workloads. For + more information, see ?. + +**To create a new volume** + +- Create a new volume : + + `# gluster volume create [stripe | replica ] [transport tcp | rdma | tcp, rdma] ` + + For example, to create a volume called test-volume consisting of + server3:/exp3 and server4:/exp4: + + # gluster volume create test-volume server3:/exp3 server4:/exp4 + Creation of test-volume has been successful + Please start the volume to access data. 
+ +Creating Distributed Volumes +============================ + +In a distributed volume, files are spread randomly across the bricks in +the volume. Use distributed volumes where you need to scale storage and +redundancy is either not important or is provided by other +hardware/software layers. + +> **Note** +> +> Disk/server failure in distributed volumes can result in a serious +> loss of data because directory contents are spread randomly across the +> bricks in the volume. + +![][] + +**To create a distributed volume** + +1. Create a trusted storage pool as described earlier in ?. + +2. Create the distributed volume: + + `# gluster volume create [transport tcp | rdma | tcp,rdma] ` + + For example, to create a distributed volume with four storage + servers using tcp: + + # gluster volume create test-volume server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 + Creation of test-volume has been successful + Please start the volume to access data. + + (Optional) You can display the volume information: + + # gluster volume info + Volume Name: test-volume + Type: Distribute + Status: Created + Number of Bricks: 4 + Transport-type: tcp + Bricks: + Brick1: server1:/exp1 + Brick2: server2:/exp2 + Brick3: server3:/exp3 + Brick4: server4:/exp4 + + For example, to create a distributed volume with four storage + servers over InfiniBand: + + # gluster volume create test-volume transport rdma server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 + Creation of test-volume has been successful + Please start the volume to access data. + + If the transport type is not specified, *tcp* is used as the + default. You can also set additional options if required, such as + auth.allow or auth.reject. For more information, see ? + + > **Note** + > + > Make sure you start your volumes before you try to mount them or + > else client operations after the mount will hang, see ? for + > details. 
+ +Creating Replicated Volumes +=========================== + +Replicated volumes create copies of files across multiple bricks in the +volume. You can use replicated volumes in environments where +high-availability and high-reliability are critical. + +> **Note** +> +> The number of bricks should be equal to the replica count for a +> replicated volume. To protect against server and disk failures, it is +> recommended that the bricks of the volume are from different servers. + +![][1] + +**To create a replicated volume** + +1. Create a trusted storage pool as described earlier in ?. + +2. Create the replicated volume: + + `# gluster volume create [replica ] [transport tcp | rdma | tcp,rdma] ` + + For example, to create a replicated volume with two storage servers: + + # gluster volume create test-volume replica 2 transport tcp server1:/exp1 server2:/exp2 + Creation of test-volume has been successful + Please start the volume to access data. + + If the transport type is not specified, *tcp* is used as the + default. You can also set additional options if required, such as + auth.allow or auth.reject. For more information, see ? + + > **Note** + > + > Make sure you start your volumes before you try to mount them or + > else client operations after the mount will hang, see ? for + > details. + +Creating Striped Volumes +======================== + +Striped volumes stripes data across bricks in the volume. For best +results, you should use striped volumes only in high concurrency +environments accessing very large files. + +> **Note** +> +> The number of bricks should be equal to the stripe count for a +> striped volume. + +![][2] + +**To create a striped volume** + +1. Create a trusted storage pool as described earlier in ?. + +2. 
Create the striped volume: + + `# gluster volume create [stripe ] [transport tcp | rdma | tcp,rdma] ` + + For example, to create a striped volume across two storage servers: + + # gluster volume create test-volume stripe 2 transport tcp server1:/exp1 server2:/exp2 + Creation of test-volume has been successful + Please start the volume to access data. + + If the transport type is not specified, *tcp* is used as the + default. You can also set additional options if required, such as + auth.allow or auth.reject. For more information, see ? + + > **Note** + > + > Make sure you start your volumes before you try to mount them or + > else client operations after the mount will hang, see ? for + > details. + +Creating Distributed Striped Volumes +==================================== + +Distributed striped volumes stripes files across two or more nodes in +the cluster. For best results, you should use distributed striped +volumes where the requirement is to scale storage and in high +concurrency environments accessing very large files is critical. + +> **Note** +> +> The number of bricks should be a multiple of the stripe count for a +> distributed striped volume. + +![][3] + +**To create a distributed striped volume** + +1. Create a trusted storage pool as described earlier in ?. + +2. Create the distributed striped volume: + + `# gluster volume create [stripe ] [transport tcp | rdma | tcp,rdma] ` + + For example, to create a distributed striped volume across eight + storage servers: + + # gluster volume create test-volume stripe 4 transport tcp server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 server5:/exp5 server6:/exp6 server7:/exp7 server8:/exp8 + Creation of test-volume has been successful + Please start the volume to access data. + + If the transport type is not specified, *tcp* is used as the + default. You can also set additional options if required, such as + auth.allow or auth.reject. For more information, see ? 
+ + > **Note** + > + > Make sure you start your volumes before you try to mount them or + > else client operations after the mount will hang, see ? for + > details. + +Creating Distributed Replicated Volumes +======================================= + +Distributes files across replicated bricks in the volume. You can use +distributed replicated volumes in environments where the requirement is +to scale storage and high-reliability is critical. Distributed +replicated volumes also offer improved read performance in most +environments. + +> **Note** +> +> The number of bricks should be a multiple of the replica count for a +> distributed replicated volume. Also, the order in which bricks are +> specified has a great effect on data protection. Each replica\_count +> consecutive bricks in the list you give will form a replica set, with +> all replica sets combined into a volume-wide distribute set. To make +> sure that replica-set members are not placed on the same node, list +> the first brick on every server, then the second brick on every server +> in the same order, and so on. + +![][4] + +**To create a distributed replicated volume** + +1. Create a trusted storage pool as described earlier in ?. + +2. Create the distributed replicated volume: + + `# gluster volume create [replica ] [transport tcp | rdma | tcp,rdma] ` + + For example, four node distributed (replicated) volume with a + two-way mirror: + + # gluster volume create test-volume replica 2 transport tcp server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 + Creation of test-volume has been successful + Please start the volume to access data. + + For example, to create a six node distributed (replicated) volume + with a two-way mirror: + + # gluster volume create test-volume replica 2 transport tcp server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 server5:/exp5 server6:/exp6 + Creation of test-volume has been successful + Please start the volume to access data. 
+ + If the transport type is not specified, *tcp* is used as the + default. You can also set additional options if required, such as + auth.allow or auth.reject. For more information, see ? + + > **Note** + > + > Make sure you start your volumes before you try to mount them or + > else client operations after the mount will hang, see ? for + > details. + +Creating Distributed Striped Replicated Volumes +=============================================== + +Distributed striped replicated volumes distributes striped data across +replicated bricks in the cluster. For best results, you should use +distributed striped replicated volumes in highly concurrent environments +where parallel access of very large files and performance is critical. +In this release, configuration of this volume type is supported only for +Map Reduce workloads. + +> **Note** +> +> The number of bricks should be a multiple of the stripe count +> and replica count for a distributed striped replicated volume. + +**To create a distributed striped replicated volume** + +1. Create a trusted storage pool as described earlier in ?. + +2. Create a distributed striped replicated volume using the following + command: + + `# gluster volume create [stripe ] [replica ] [transport tcp | rdma | tcp,rdma] ` + + For example, to create a distributed replicated striped volume + across eight storage servers: + + # gluster volume create test-volume stripe 2 replica 2 transport tcp server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 server5:/exp5 server6:/exp6 server7:/exp7 server8:/exp8 + Creation of test-volume has been successful + Please start the volume to access data. + + If the transport type is not specified, *tcp* is used as the + default. You can also set additional options if required, such as + auth.allow or auth.reject. For more information, see ? + + > **Note** + > + > Make sure you start your volumes before you try to mount them or + > else client operations after the mount will hang, see ? 
for + > details. + +Creating Striped Replicated Volumes +=================================== + +Striped replicated volumes stripes data across replicated bricks in the +cluster. For best results, you should use striped replicated volumes in +highly concurrent environments where there is parallel access of very +large files and performance is critical. In this release, configuration +of this volume type is supported only for Map Reduce workloads. + +> **Note** +> +> The number of bricks should be a multiple of the replicate count and +> stripe count for a striped replicated volume. + +![][5] + +**To create a striped replicated volume** + +1. Create a trusted storage pool consisting of the storage servers that + will comprise the volume. + + For more information, see ?. + +2. Create a striped replicated volume : + + `# gluster volume create [stripe ] [replica ] [transport tcp | rdma | tcp,rdma] ` + + For example, to create a striped replicated volume across four + storage servers: + + # gluster volume create test-volume stripe 2 replica 2 transport tcp server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 + Creation of test-volume has been successful + Please start the volume to access data. + + To create a striped replicated volume across six storage servers: + + # gluster volume create test-volume stripe 3 replica 2 transport tcp server1:/exp1 server2:/exp2 server3:/exp3 server4:/exp4 server5:/exp5 server6:/exp6 + Creation of test-volume has been successful + Please start the volume to access data. + + If the transport type is not specified, *tcp* is used as the + default. You can also set additional options if required, such as + auth.allow or auth.reject. For more information, see ? + + > **Note** + > + > Make sure you start your volumes before you try to mount them or + > else client operations after the mount will hang, see ? for + > details. + +Starting Volumes +================ + +You must start your volumes before you try to mount them. 
+ +**To start a volume** + +- Start a volume: + + `# gluster volume start ` + + For example, to start test-volume: + + # gluster volume start test-volume + Starting test-volume has been successful + + []: images/Distributed_Volume.png + [1]: images/Replicated_Volume.png + [2]: images/Striped_Volume.png + [3]: images/Distributed_Striped_Volume.png + [4]: images/Distributed_Replicated_Volume.png + [5]: images/Striped_Replicated_Volume.png diff --git a/doc/admin-guide/en-US/markdown/admin_settingup_clients.md b/doc/admin-guide/en-US/markdown/admin_settingup_clients.md new file mode 100644 index 000000000..85b28c952 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_settingup_clients.md @@ -0,0 +1,641 @@ +Accessing Data - Setting Up GlusterFS Client +============================================ + +You can access gluster volumes in multiple ways. You can use Gluster +Native Client method for high concurrency, performance and transparent +failover in GNU/Linux clients. You can also use NFS v3 to access gluster +volumes. Extensive testing has be done on GNU/Linux clients and NFS +implementation in other operating system, such as FreeBSD, and Mac OS X, +as well as Windows 7 (Professional and Up) and Windows Server 2003. +Other NFS client implementations may work with gluster NFS server. + +You can use CIFS to access volumes when using Microsoft Windows as well +as SAMBA clients. For this access method, Samba packages need to be +present on the client side. + +Gluster Native Client +===================== + +The Gluster Native Client is a FUSE-based client running in user space. +Gluster Native Client is the recommended method for accessing volumes +when high concurrency and high write performance is required. + +This section introduces the Gluster Native Client and explains how to +install the software on client machines. 
This section also describes how +to mount volumes on clients (both manually and automatically) and how to +verify that the volume has mounted successfully. + +Installing the Gluster Native Client +------------------------------------ + +Before you begin installing the Gluster Native Client, you need to +verify that the FUSE module is loaded on the client and has access to +the required modules as follows: + +1. Add the FUSE loadable kernel module (LKM) to the Linux kernel: + + `# modprobe fuse` + +2. Verify that the FUSE module is loaded: + + `# dmesg | grep -i fuse ` + + `fuse init (API version 7.13)` + +### Installing on Red Hat Package Manager (RPM) Distributions + +To install Gluster Native Client on RPM distribution-based systems + +1. Install required prerequisites on the client using the following + command: + + `$ sudo yum -y install openssh-server wget fuse fuse-libs openib libibverbs` + +2. Ensure that TCP and UDP ports 24007 and 24008 are open on all + Gluster servers. Apart from these ports, you need to open one port + for each brick starting from port 24009. For example: if you have + five bricks, you need to have ports 24009 to 24013 open. + + You can use the following chains with iptables: + + `$ sudo iptables -A RH-Firewall-1-INPUT -m state --state NEW -m tcp -p tcp --dport 24007:24008 -j ACCEPT ` + + `$ sudo iptables -A RH-Firewall-1-INPUT -m state --state NEW -m tcp -p tcp --dport 24009:24014 -j ACCEPT` + + > **Note** + > + > If you already have iptable chains, make sure that the above + > ACCEPT rules precede the DROP rules. This can be achieved by + > providing a lower rule number than the DROP rule. + +3. Download the latest glusterfs, glusterfs-fuse, and glusterfs-rdma + RPM files to each client. The glusterfs package contains the Gluster + Native Client. The glusterfs-fuse package contains the FUSE + translator required for mounting on client systems and the + glusterfs-rdma packages contain OpenFabrics verbs RDMA module for + Infiniband. 
+ + You can download the software at [][]. + +4. Install Gluster Native Client on the client. + + `$ sudo rpm -i glusterfs-3.3.0qa30-1.x86_64.rpm ` + + `$ sudo rpm -i glusterfs-fuse-3.3.0qa30-1.x86_64.rpm ` + + `$ sudo rpm -i glusterfs-rdma-3.3.0qa30-1.x86_64.rpm` + + > **Note** + > + > The RDMA module is only required when using Infiniband. + +### Installing on Debian-based Distributions + +To install Gluster Native Client on Debian-based distributions + +1. Install OpenSSH Server on each client using the following command: + + `$ sudo apt-get install openssh-server vim wget` + +2. Download the latest GlusterFS .deb file and checksum to each client. + + You can download the software at [][1]. + +3. For each .deb file, get the checksum (using the following command) + and compare it against the checksum for that file in the md5sum + file. + + `$ md5sum GlusterFS_DEB_file.deb ` + + The md5sum of the packages is available at: [][2] + +4. Uninstall GlusterFS v3.1 (or an earlier version) from the client + using the following command: + + `$ sudo dpkg -r glusterfs ` + + (Optional) Run `$ sudo dpkg -purge glusterfs `to purge the + configuration files. + +5. Install Gluster Native Client on the client using the following + command: + + `$ sudo dpkg -i GlusterFS_DEB_file ` + + For example: + + `$ sudo dpkg -i glusterfs-3.3.x.deb ` + +6. Ensure that TCP and UDP ports 24007 and 24008 are open on all + Gluster servers. Apart from these ports, you need to open one port + for each brick starting from port 24009. For example: if you have + five bricks, you need to have ports 24009 to 24013 open. 
+ + You can use the following chains with iptables: + + `$ sudo iptables -A RH-Firewall-1-INPUT -m state --state NEW -m tcp -p tcp --dport 24007:24008 -j ACCEPT ` + + `$ sudo iptables -A RH-Firewall-1-INPUT -m state --state NEW -m tcp -p tcp --dport 24009:24014 -j ACCEPT` + + > **Note** + > + > If you already have iptable chains, make sure that the above + > ACCEPT rules precede the DROP rules. This can be achieved by + > providing a lower rule number than the DROP rule. + +### Performing a Source Installation + +To build and install Gluster Native Client from the source code + +1. Create a new directory using the following commands: + + `# mkdir glusterfs ` + + `# cd glusterfs` + +2. Download the source code. + + You can download the source at [][1]. + +3. Extract the source code using the following command: + + `# tar -xvzf SOURCE-FILE ` + +4. Run the configuration utility using the following command: + + `# ./configure ` + + `GlusterFS configure summary ` + + `================== ` + + `FUSE client : yes ` + + `Infiniband verbs : yes ` + + `epoll IO multiplex : yes ` + + `argp-standalone : no ` + + `fusermount : no ` + + `readline : yes` + + The configuration summary shows the components that will be built + with Gluster Native Client. + +5. Build the Gluster Native Client software using the following + commands: + + `# make ` + + `# make install` + +6. Verify that the correct version of Gluster Native Client is + installed, using the following command: + + `# glusterfs –-version` + +Mounting Volumes +---------------- + +After installing the Gluster Native Client, you need to mount Gluster +volumes to access data. There are two methods you can choose: + +- ? + +- ? + +After mounting a volume, you can test the mounted volume using the +procedure described in ?. + +> **Note** +> +> Server names selected during creation of Volumes should be resolvable +> in the client machine. 
You can use appropriate /etc/hosts entries or +> DNS server to resolve server names to IP addresses. + +### Manually Mounting Volumes + +To manually mount a Gluster volume + +- To mount a volume, use the following command: + + `# mount -t glusterfs HOSTNAME-OR-IPADDRESS:/VOLNAME MOUNTDIR` + + For example: + + `# mount -t glusterfs server1:/test-volume /mnt/glusterfs` + + > **Note** + > + > The server specified in the mount command is only used to fetch + > the gluster configuration volfile describing the volume name. + > Subsequently, the client will communicate directly with the + > servers mentioned in the volfile (which might not even include the + > one used for mount). + > + > If you see a usage message like "Usage: mount.glusterfs", mount + > usually requires you to create a directory to be used as the mount + > point. Run "mkdir /mnt/glusterfs" before you attempt to run the + > mount command listed above. + +**Mounting Options** + +You can specify the following options when using the +`mount -t glusterfs` command. Note that you need to separate all options +with commas. + +backupvolfile-server=server-name + +volfile-max-fetch-attempts=number of attempts + +log-level=loglevel + +log-file=logfile + +transport=transport-type + +direct-io-mode=[enable|disable] + +For example: + +`# mount -t glusterfs -o backupvolfile-server=volfile_server2 --volfile-max-fetch-attempts=2 log-level=WARNING,log-file=/var/log/gluster.log server1:/test-volume /mnt/glusterfs` + +If `backupvolfile-server` option is added while mounting fuse client, +when the first volfile server fails, then the server specified in +`backupvolfile-server` option is used as volfile server to mount the +client. + +In `--volfile-max-fetch-attempts=X` option, specify the number of +attempts to fetch volume files while mounting a volume. This option is +useful when you mount a server with multiple IP addresses or when +round-robin DNS is configured for the server-name.. 
+ +### Automatically Mounting Volumes + +You can configure your system to automatically mount the Gluster volume +each time your system starts. + +The server specified in the mount command is only used to fetch the +gluster configuration volfile describing the volume name. Subsequently, +the client will communicate directly with the servers mentioned in the +volfile (which might not even include the one used for mount). + +**To automatically mount a Gluster volume** + +- To mount a volume, edit the /etc/fstab file and add the following + line: + + `HOSTNAME-OR-IPADDRESS:/VOLNAME MOUNTDIR glusterfs defaults,_netdev 0 0 ` + + For example: + + `server1:/test-volume /mnt/glusterfs glusterfs defaults,_netdev 0 0` + +**Mounting Options** + +You can specify the following options when updating the /etc/fstab file. +Note that you need to separate all options with commas. + +log-level=loglevel + +log-file=logfile + +transport=transport-type + +direct-io-mode=[enable|disable] + +For example: + +`HOSTNAME-OR-IPADDRESS:/VOLNAME MOUNTDIR glusterfs defaults,_netdev,log-level=WARNING,log-file=/var/log/gluster.log 0 0 ` + +### Testing Mounted Volumes + +To test mounted volumes + +- Use the following command: + + `# mount ` + + If the gluster volume was successfully mounted, the output of the + mount command on the client will be similar to this example: + + `server1:/test-volume on /mnt/glusterfs type fuse.glusterfs (rw,allow_other,default_permissions,max_read=131072` + +- Use the following command: + + `# df` + + The output of df command on the client will display the aggregated + storage space from all the bricks in a volume similar to this + example: + + `# df -h /mnt/glusterfs Filesystem Size Used Avail Use% Mounted on server1:/test-volume 28T 22T 5.4T 82% /mnt/glusterfs` + +- Change to the directory and list the contents by entering the + following: + + `# cd MOUNTDIR ` + + `# ls` + +- For example, + + `# cd /mnt/glusterfs ` + + `# ls` + +NFS +=== + +You can use NFS v3 to 
access to gluster volumes. Extensive testing has +be done on GNU/Linux clients and NFS implementation in other operating +system, such as FreeBSD, and Mac OS X, as well as Windows 7 +(Professional and Up), Windows Server 2003, and others, may work with +gluster NFS server implementation. + +GlusterFS now includes network lock manager (NLM) v4. NLM enables +applications on NFSv3 clients to do record locking on files on NFS +server. It is started automatically whenever the NFS server is run. + +You must install nfs-common package on both servers and clients (only +for Debian-based) distribution. + +This section describes how to use NFS to mount Gluster volumes (both +manually and automatically) and how to verify that the volume has been +mounted successfully. + +Using NFS to Mount Volumes +-------------------------- + +You can use either of the following methods to mount Gluster volumes: + +- ? + +- ? + +**Prerequisite**: Install nfs-common package on both servers and clients +(only for Debian-based distribution), using the following command: + +`$ sudo aptitude install nfs-common ` + +After mounting a volume, you can test the mounted volume using the +procedure described in ?. + +### Manually Mounting Volumes Using NFS + +To manually mount a Gluster volume using NFS + +- To mount a volume, use the following command: + + `# mount -t nfs -o vers=3 HOSTNAME-OR-IPADDRESS:/VOLNAME MOUNTDIR` + + For example: + + `# mount -t nfs -o vers=3 server1:/test-volume /mnt/glusterfs` + + > **Note** + > + > Gluster NFS server does not support UDP. If the NFS client you are + > using defaults to connecting using UDP, the following message + > appears: + > + > `requested NFS version or transport protocol is not supported`. 
+ + **To connect using TCP** + +- Add the following option to the mount command: + + `-o mountproto=tcp ` + + For example: + + `# mount -o mountproto=tcp -t nfs server1:/test-volume /mnt/glusterfs` + +**To mount Gluster NFS server from a Solaris client** + +- Use the following command: + + `# mount -o proto=tcp,vers=3 nfs://HOSTNAME-OR-IPADDRESS:38467/VOLNAME MOUNTDIR` + + For example: + + ` # mount -o proto=tcp,vers=3 nfs://server1:38467/test-volume /mnt/glusterfs` + +### Automatically Mounting Volumes Using NFS + +You can configure your system to automatically mount Gluster volumes +using NFS each time the system starts. + +**To automatically mount a Gluster volume using NFS** + +- To mount a volume, edit the /etc/fstab file and add the following + line: + + `HOSTNAME-OR-IPADDRESS:/VOLNAME MOUNTDIR nfs defaults,_netdev,vers=3 0 0` + + For example, + + `server1:/test-volume /mnt/glusterfs nfs defaults,_netdev,vers=3 0 0` + + > **Note** + > + > Gluster NFS server does not support UDP. If the NFS client you are + > using defaults to connecting using UDP, the following message + > appears: + > + > `requested NFS version or transport protocol is not supported.` + + To connect using TCP + +- Add the following entry in /etc/fstab file : + + `HOSTNAME-OR-IPADDRESS:/VOLNAME MOUNTDIR nfs defaults,_netdev,mountproto=tcp 0 0` + + For example, + + `server1:/test-volume /mnt/glusterfs nfs defaults,_netdev,mountproto=tcp 0 0` + +**To automount NFS mounts** + +Gluster supports \*nix standard method of automounting NFS mounts. +Update the /etc/auto.master and /etc/auto.misc and restart the autofs +service. After that, whenever a user or process attempts to access the +directory it will be mounted in the background. + +### Testing Volumes Mounted Using NFS + +You can confirm that Gluster directories are mounting successfully. 
+ +**To test mounted volumes** + +- Use the mount command by entering the following: + + `# mount` + + For example, the output of the mount command on the client will + display an entry like the following: + + `server1:/test-volume on /mnt/glusterfs type nfs (rw,vers=3,addr=server1)` + +- Use the df command by entering the following: + + `# df` + + For example, the output of df command on the client will display the + aggregated storage space from all the bricks in a volume. + + # df -h /mnt/glusterfs + Filesystem Size Used Avail Use% Mounted on + server1:/test-volume 28T 22T 5.4T 82% /mnt/glusterfs + +- Change to the directory and list the contents by entering the + following: + + `# cd MOUNTDIR` + + `# ls` + + For example, + + ` + + ` + + `# ls` + +CIFS +==== + +You can use CIFS to access to volumes when using Microsoft Windows as +well as SAMBA clients. For this access method, Samba packages need to be +present on the client side. You can export glusterfs mount point as the +samba export, and then mount it using CIFS protocol. + +This section describes how to mount CIFS shares on Microsoft +Windows-based clients (both manually and automatically) and how to +verify that the volume has mounted successfully. + +> **Note** +> +> CIFS access using the Mac OS X Finder is not supported, however, you +> can use the Mac OS X command line to access Gluster volumes using +> CIFS. + +Using CIFS to Mount Volumes +--------------------------- + +You can use either of the following methods to mount Gluster volumes: + +- ? + +- ? + +After mounting a volume, you can test the mounted volume using the +procedure described in ?. + +You can also use Samba for exporting Gluster Volumes through CIFS +protocol. + +### Exporting Gluster Volumes Through Samba + +We recommend you to use Samba for exporting Gluster volumes through the +CIFS protocol. + +**To export volumes through CIFS protocol** + +1. Mount a Gluster volume. For more information on mounting volumes, + see ?. + +2. 
Setup Samba configuration to export the mount point of the Gluster + volume. + + For example, if a Gluster volume is mounted on /mnt/gluster, you + must edit smb.conf file to enable exporting this through CIFS. Open + smb.conf file in an editor and add the following lines for a simple + configuration: + + [glustertest] + + comment = For testing a Gluster volume exported through CIFS + + path = /mnt/glusterfs + + read only = no + + guest ok = yes + +Save the changes and start the smb service using your systems init +scripts (/etc/init.d/smb [re]start). + +> **Note** +> +> To be able mount from any server in the trusted storage pool, you must +> repeat these steps on each Gluster node. For more advanced +> configurations, see Samba documentation. + +### Manually Mounting Volumes Using CIFS + +You can manually mount Gluster volumes using CIFS on Microsoft +Windows-based client machines. + +**To manually mount a Gluster volume using CIFS** + +1. Using Windows Explorer, choose **Tools \> Map Network Drive…** from + the menu. The **Map Network Drive**window appears. + +2. Choose the drive letter using the **Drive** drop-down list. + +3. Click **Browse**, select the volume to map to the network drive, and + click **OK**. + +4. Click **Finish.** + +The network drive (mapped to the volume) appears in the Computer window. + +**Alternatively, to manually mount a Gluster volume using CIFS.** + +- Click **Start \> Run** and enter the following: + + ` + + ` + + For example: + + ` + + ` + +### Automatically Mounting Volumes Using CIFS + +You can configure your system to automatically mount Gluster volumes +using CIFS on Microsoft Windows-based clients each time the system +starts. + +**To automatically mount a Gluster volume using CIFS** + +The network drive (mapped to the volume) appears in the Computer window +and is reconnected each time the system starts. + +1. Using Windows Explorer, choose **Tools \> Map Network Drive…** from + the menu. 
The **Map Network Drive**window appears. + +2. Choose the drive letter using the **Drive** drop-down list. + +3. Click **Browse**, select the volume to map to the network drive, and + click **OK**. + +4. Click the **Reconnect** at logon checkbox. + +5. Click **Finish.** + +### Testing Volumes Mounted Using CIFS + +You can confirm that Gluster directories are mounting successfully by +navigating to the directory using Windows Explorer. + + []: http://bits.gluster.com/gluster/glusterfs/3.3.0qa30/x86_64/ + [1]: http://www.gluster.org/download/ + [2]: http://download.gluster.com/pub/gluster/glusterfs diff --git a/doc/admin-guide/en-US/markdown/admin_start_stop_daemon.md b/doc/admin-guide/en-US/markdown/admin_start_stop_daemon.md new file mode 100644 index 000000000..43251cd01 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_start_stop_daemon.md @@ -0,0 +1,70 @@ +Managing the glusterd Service +============================= + +After installing GlusterFS, you must start glusterd service. The +glusterd service serves as the Gluster elastic volume manager, +overseeing glusterfs processes, and co-ordinating dynamic volume +operations, such as adding and removing volumes across multiple storage +servers non-disruptively. + +This section describes how to start the glusterd service in the +following ways: + +- ? + +- ? + +> **Note** +> +> You must start glusterd on all GlusterFS servers. + +Starting and Stopping glusterd Manually +======================================= + +This section describes how to start and stop glusterd manually + +- To start glusterd manually, enter the following command: + + `# /etc/init.d/glusterd start ` + +- To stop glusterd manually, enter the following command: + + `# /etc/init.d/glusterd stop` + +Starting glusterd Automatically +=============================== + +This section describes how to configure the system to automatically +start the glusterd service every time the system boots. 
+
+To automatically start the glusterd service every time the system boots,
+enter the following from the command line:
+
+`# chkconfig glusterd on `
+
+Red Hat-based Systems
+---------------------
+
+To configure Red Hat-based systems to automatically start the glusterd
+service every time the system boots, enter the following from the
+command line:
+
+`# chkconfig glusterd on `
+
+Debian-based Systems
+--------------------
+
+To configure Debian-based systems to automatically start the glusterd
+service every time the system boots, enter the following from the
+command line:
+
+`# update-rc.d glusterd defaults`
+
+Systems Other than Red Hat and Debian
+-------------------------------------
+
+To configure systems other than Red Hat or Debian to automatically start
+the glusterd service every time the system boots, enter the following
+entry to the */etc/rc.local* file:
+
+`# echo "glusterd" >> /etc/rc.local ` diff --git a/doc/admin-guide/en-US/markdown/admin_storage_pools.md b/doc/admin-guide/en-US/markdown/admin_storage_pools.md new file mode 100644 index 000000000..2a35cbea5 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_storage_pools.md @@ -0,0 +1,73 @@ +Setting up Trusted Storage Pools
+================================
+
+Before you can configure a GlusterFS volume, you must create a trusted
+storage pool consisting of the storage servers that provide bricks to a
+volume.
+
+A storage pool is a trusted network of storage servers. When you start
+the first server, the storage pool consists of that server alone. To add
+additional storage servers to the storage pool, you can use the probe
+command from a storage server that is already trusted.
+
+> **Note**
+>
+> Do not self-probe the first server/localhost.
+
+The GlusterFS service must be running on all storage servers that you
+want to add to the storage pool. See ? for more information.
+ +Adding Servers to Trusted Storage Pool +====================================== + +To create a trusted storage pool, add servers to the trusted storage +pool + +1. The hostnames used to create the storage pool must be resolvable by + DNS. + + To add a server to the storage pool: + + `# gluster peer probe ` + + For example, to create a trusted storage pool of four servers, add + three servers to the storage pool from server1: + + # gluster peer probe server2 + Probe successful + + # gluster peer probe server3 + Probe successful + + # gluster peer probe server4 + Probe successful + +2. Verify the peer status from the first server using the following + commands: + + # gluster peer status + Number of Peers: 3 + + Hostname: server2 + Uuid: 5e987bda-16dd-43c2-835b-08b7d55e94e5 + State: Peer in Cluster (Connected) + + Hostname: server3 + Uuid: 1e0ca3aa-9ef7-4f66-8f15-cbc348f29ff7 + State: Peer in Cluster (Connected) + + Hostname: server4 + Uuid: 3e0caba-9df7-4f66-8e5d-cbc348f29ff7 + State: Peer in Cluster (Connected) + +Removing Servers from the Trusted Storage Pool +============================================== + +To remove a server from the storage pool: + +`# gluster peer detach` + +For example, to remove server4 from the trusted storage pool: + + # gluster peer detach server4 + Detach successful diff --git a/doc/admin-guide/en-US/markdown/admin_troubleshooting.md b/doc/admin-guide/en-US/markdown/admin_troubleshooting.md new file mode 100644 index 000000000..88fb85c24 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/admin_troubleshooting.md @@ -0,0 +1,543 @@ +Troubleshooting GlusterFS +========================= + +This section describes how to manage GlusterFS logs and most common +troubleshooting scenarios related to GlusterFS. 
+ +Managing GlusterFS Logs +======================= + +This section describes how to manage GlusterFS logs by performing the +following operation: + +- Rotating Logs + +Rotating Logs +------------- + +Administrators can rotate the log file in a volume, as needed. + +**To rotate a log file** + +- Rotate the log file using the following command: + + `# gluster volume log rotate ` + + For example, to rotate the log file on test-volume: + + # gluster volume log rotate test-volume + log rotate successful + + > **Note** + > + > When a log file is rotated, the contents of the current log file + > are moved to log-file- name.epoch-time-stamp. + +Troubleshooting Geo-replication +=============================== + +This section describes the most common troubleshooting scenarios related +to GlusterFS Geo-replication. + +Locating Log Files +------------------ + +For every Geo-replication session, the following three log files are +associated to it (four, if the slave is a gluster volume): + +- Master-log-file - log file for the process which monitors the Master + volume + +- Slave-log-file - log file for process which initiates the changes in + slave + +- Master-gluster-log-file - log file for the maintenance mount point + that Geo-replication module uses to monitor the master volume + +- Slave-gluster-log-file - is the slave's counterpart of it + +**Master Log File** + +To get the Master-log-file for geo-replication, use the following +command: + +`gluster volume geo-replication config log-file` + +For example: + +`# gluster volume geo-replication Volume1 example.com:/data/remote_dir config log-file ` + +**Slave Log File** + +To get the log file for Geo-replication on slave (glusterd must be +running on slave machine), use the following commands: + +1. On master, run the following command: + + `# gluster volume geo-replication Volume1 example.com:/data/remote_dir config session-owner 5f6e5200-756f-11e0-a1f0-0800200c9a66 ` + + Displays the session owner details. + +2. 
On slave, run the following command:
+
+    `# gluster volume geo-replication /data/remote_dir config log-file /var/log/gluster/${session-owner}:remote-mirror.log `
+
+3.  Replace the session owner details (output of Step 1) in the output
+    of Step 2 to get the location of the log file.
+
+    `/var/log/gluster/5f6e5200-756f-11e0-a1f0-0800200c9a66:remote-mirror.log`
+
+Rotating Geo-replication Logs
+-----------------------------
+
+Administrators can rotate the log file of a particular master-slave
+session, as needed. When you run geo-replication's `log-rotate`
+command, the log file is backed up with the current timestamp suffixed
+to the file name and a signal is sent to gsyncd to start logging to a new
+log file.
+
+**To rotate a geo-replication log file**
+
+-   Rotate log file for a particular master-slave session using the
+    following command:
+
+    `# gluster volume geo-replication log-rotate`
+
+    For example, to rotate the log file of master `Volume1` and slave
+    `example.com:/data/remote_dir` :
+
+        # gluster volume geo-replication Volume1 example.com:/data/remote_dir log rotate
+        log rotate successful
+
+-   Rotate log file for all sessions for a master volume using the
+    following command:
+
+    `# gluster volume geo-replication log-rotate`
+
+    For example, to rotate the log file of master `Volume1`:
+
+        # gluster volume geo-replication Volume1 log rotate
+        log rotate successful
+
+-   Rotate log file for all sessions using the following command:
+
+    `# gluster volume geo-replication log-rotate`
+
+    For example, to rotate the log file for all sessions:
+
+        # gluster volume geo-replication log rotate
+        log rotate successful
+
+Synchronization is not complete
+-------------------------------
+
+**Description**: GlusterFS Geo-replication did not synchronize the data
+completely but still the geo-replication status displayed is OK.
+
+**Solution**: You can enforce a full sync of the data by erasing the
+index and restarting GlusterFS Geo-replication.
After restarting, +GlusterFS Geo-replication begins synchronizing all the data. All files +are compared using checksum, which can be a lengthy and high resource +utilization operation on large data sets. If the error situation +persists, contact Red Hat Support. + +For more information about erasing index, see ?. + +Issues in Data Synchronization +------------------------------ + +**Description**: Geo-replication display status as OK, but the files do +not get synced, only directories and symlink gets synced with the +following error message in the log: + +[2011-05-02 13:42:13.467644] E [master:288:regjob] GMaster: failed to +sync ./some\_file\` + +**Solution**: Geo-replication invokes rsync v3.0.0 or higher on the host +and the remote machine. You must verify if you have installed the +required version. + +Geo-replication status displays Faulty very often +------------------------------------------------- + +**Description**: Geo-replication displays status as faulty very often +with a backtrace similar to the following: + +2011-04-28 14:06:18.378859] E [syncdutils:131:log\_raise\_exception] +\<top\>: FAIL: Traceback (most recent call last): File +"/usr/local/libexec/glusterfs/python/syncdaemon/syncdutils.py", line +152, in twraptf(\*aa) File +"/usr/local/libexec/glusterfs/python/syncdaemon/repce.py", line 118, in +listen rid, exc, res = recv(self.inf) File +"/usr/local/libexec/glusterfs/python/syncdaemon/repce.py", line 42, in +recv return pickle.load(inf) EOFError + +**Solution**: This error indicates that the RPC communication between +the master gsyncd module and slave gsyncd module is broken and this can +happen for various reasons. Check if it satisfies all the following +pre-requisites: + +- Password-less SSH is set up properly between the host and the remote + machine. + +- If FUSE is installed in the machine, because geo-replication module + mounts the GlusterFS volume using FUSE to sync data. 
+ +- If the **Slave** is a volume, check if that volume is started. + +- If the Slave is a plain directory, verify if the directory has been + created already with the required permissions. + +- If GlusterFS 3.2 or higher is not installed in the default location + (in Master) and has been prefixed to be installed in a custom + location, configure the `gluster-command` for it to point to the + exact location. + +- If GlusterFS 3.2 or higher is not installed in the default location + (in slave) and has been prefixed to be installed in a custom + location, configure the `remote-gsyncd-command` for it to point to + the exact place where gsyncd is located. + +Intermediate Master goes to Faulty State +---------------------------------------- + +**Description**: In a cascading set-up, the intermediate master goes to +faulty state with the following log: + +raise RuntimeError ("aborting on uuid change from %s to %s" % \\ +RuntimeError: aborting on uuid change from af07e07c-427f-4586-ab9f- +4bf7d299be81 to de6b5040-8f4e-4575-8831-c4f55bd41154 + +**Solution**: In a cascading set-up the Intermediate master is loyal to +the original primary master. The above log means that the +geo-replication module has detected change in primary master. If this is +the desired behavior, delete the config option volume-id in the session +initiated from the intermediate master. + +Troubleshooting POSIX ACLs +========================== + +This section describes the most common troubleshooting issues related to +POSIX ACLs. + +setfacl command fails with “setfacl: \<file or directory name\>: Operation not supported” error +----------------------------------------------------------------------------------------------- + +You may face this error when the backend file systems in one of the +servers is not mounted with the "-o acl" option. The same can be +confirmed by viewing the following error message in the log file of the +server "Posix access control list is not supported". 
+ +**Solution**: Remount the backend file system with "-o acl" option. For +more information, see ?. + +Troubleshooting Hadoop Compatible Storage +========================================= + +This section describes the most common troubleshooting issues related to +Hadoop Compatible Storage. + +Time Sync +--------- + +Running MapReduce job may throw exceptions if the time is out-of-sync on +the hosts in the cluster. + +**Solution**: Sync the time on all hosts using ntpd program. + +Troubleshooting NFS +=================== + +This section describes the most common troubleshooting issues related to +NFS . + +mount command on NFS client fails with “RPC Error: Program not registered” +-------------------------------------------------------------------------- + +Start portmap or rpcbind service on the NFS server. + +This error is encountered when the server has not started correctly. + +On most Linux distributions this is fixed by starting portmap: + +`$ /etc/init.d/portmap start` + +On some distributions where portmap has been replaced by rpcbind, the +following command is required: + +`$ /etc/init.d/rpcbind start ` + +After starting portmap or rpcbind, gluster NFS server needs to be +restarted. + +NFS server start-up fails with “Port is already in use” error in the log file." +------------------------------------------------------------------------------- + +Another Gluster NFS server is running on the same machine. + +This error can arise in case there is already a Gluster NFS server +running on the same machine. 
This situation can be confirmed from the +log file, if the following error lines exist: + + [2010-05-26 23:40:49] E [rpc-socket.c:126:rpcsvc_socket_listen] rpc-socket: binding socket failed:Address already in use + [2010-05-26 23:40:49] E [rpc-socket.c:129:rpcsvc_socket_listen] rpc-socket: Port is already in use + [2010-05-26 23:40:49] E [rpcsvc.c:2636:rpcsvc_stage_program_register] rpc-service: could not create listening connection + [2010-05-26 23:40:49] E [rpcsvc.c:2675:rpcsvc_program_register] rpc-service: stage registration of program failed + [2010-05-26 23:40:49] E [rpcsvc.c:2695:rpcsvc_program_register] rpc-service: Program registration failed: MOUNT3, Num: 100005, Ver: 3, Port: 38465 + [2010-05-26 23:40:49] E [nfs.c:125:nfs_init_versions] nfs: Program init failed + [2010-05-26 23:40:49] C [nfs.c:531:notify] nfs: Failed to initialize protocols + +To resolve this error one of the Gluster NFS servers will have to be +shutdown. At this time, Gluster NFS server does not support running +multiple NFS servers on the same machine. + +mount command fails with “rpc.statd” related error message +---------------------------------------------------------- + +If the mount command fails with the following error message: + +mount.nfs: rpc.statd is not running but is required for remote locking. +mount.nfs: Either use '-o nolock' to keep locks local, or start statd. + +Start rpc.statd + +For NFS clients to mount the NFS server, rpc.statd service must be +running on the clients. + +Start rpc.statd service by running the following command: + +`$ rpc.statd ` + +mount command takes too long to finish. +--------------------------------------- + +Start rpcbind service on the NFS client. + +The problem is that the rpcbind or portmap service is not running on the +NFS client. 
The resolution for this is to start either of these services +by running the following command: + +`$ /etc/init.d/portmap start` + +On some distributions where portmap has been replaced by rpcbind, the +following command is required: + +`$ /etc/init.d/rpcbind start` + +NFS server glusterfsd starts but initialization fails with “nfsrpc- service: portmap registration of program failed” error message in the log. +---------------------------------------------------------------------------------------------------------------------------------------------- + +NFS start-up can succeed but the initialization of the NFS service can +still fail preventing clients from accessing the mount points. Such a +situation can be confirmed from the following error messages in the log +file: + + [2010-05-26 23:33:47] E [rpcsvc.c:2598:rpcsvc_program_register_portmap] rpc-service: Could notregister with portmap + [2010-05-26 23:33:47] E [rpcsvc.c:2682:rpcsvc_program_register] rpc-service: portmap registration of program failed + [2010-05-26 23:33:47] E [rpcsvc.c:2695:rpcsvc_program_register] rpc-service: Program registration failed: MOUNT3, Num: 100005, Ver: 3, Port: 38465 + [2010-05-26 23:33:47] E [nfs.c:125:nfs_init_versions] nfs: Program init failed + [2010-05-26 23:33:47] C [nfs.c:531:notify] nfs: Failed to initialize protocols + [2010-05-26 23:33:49] E [rpcsvc.c:2614:rpcsvc_program_unregister_portmap] rpc-service: Could not unregister with portmap + [2010-05-26 23:33:49] E [rpcsvc.c:2731:rpcsvc_program_unregister] rpc-service: portmap unregistration of program failed + [2010-05-26 23:33:49] E [rpcsvc.c:2744:rpcsvc_program_unregister] rpc-service: Program unregistration failed: MOUNT3, Num: 100005, Ver: 3, Port: 38465 + +1. Start portmap or rpcbind service on the NFS server. 
+
+    On most Linux distributions, portmap can be started using the
+    following command:
+
+    `$ /etc/init.d/portmap start `
+
+    On some distributions where portmap has been replaced by rpcbind,
+    run the following command:
+
+    `$ /etc/init.d/rpcbind start `
+
+    After starting portmap or rpcbind, gluster NFS server needs to be
+    restarted.
+
+2.  Stop another NFS server running on the same machine.
+
+    Such an error is also seen when there is another NFS server running
+    on the same machine but it is not the Gluster NFS server. On Linux
+    systems, this could be the kernel NFS server. Resolution involves
+    stopping the other NFS server or not running the Gluster NFS server
+    on the machine. Before stopping the kernel NFS server, ensure that
+    no critical service depends on access to that NFS server's exports.
+
+    On Linux, kernel NFS servers can be stopped by using either of the
+    following commands depending on the distribution in use:
+
+    `$ /etc/init.d/nfs-kernel-server stop`
+
+    `$ /etc/init.d/nfs stop`
+
+3.  Restart Gluster NFS server.
+
+mount command fails with NFS server failed error.
+-------------------------------------------------
+
+mount command fails with the following error
+
+*mount: mount to NFS server '10.1.10.11' failed: timed out (retrying).*
+
+Perform one of the following to resolve this issue:
+
+1.  Disable name lookup requests from NFS server to a DNS server.
+
+    The NFS server attempts to authenticate NFS clients by performing a
+    reverse DNS lookup to match hostnames in the volume file with the
+    client IP addresses. There can be a situation where the NFS server
+    either is not able to connect to the DNS server or the DNS server is
+    taking too long to respond to DNS requests. These delays can result
+    in delayed replies from the NFS server to the NFS client resulting
+    in the timeout error seen above.
+
+    NFS server provides a work-around that disables DNS requests,
+    instead relying only on the client IP addresses for authentication.
+
+    The following option can be added for successful mounting in such
+    situations:
+
+    `option rpc-auth.addr.namelookup off `
+
+    > **Note**
+    >
+    > Remember that disabling name lookup forces the NFS server to
+    > authenticate clients using only IP addresses, and if the
+    > authentication rules in the volume file use hostnames, those
+    > authentication rules will fail and disallow mounting for those
+    > clients.
+
+    or
+
+2.  NFS version used by the NFS client is other than version 3.
+
+    Gluster NFS server supports version 3 of NFS protocol. In recent
+    Linux kernels, the default NFS version has been changed from 3 to 4.
+    It is possible that the client machine is unable to connect to the
+    Gluster NFS server because it is using version 4 messages which are
+    not understood by Gluster NFS server. The timeout can be resolved by
+    forcing the NFS client to use version 3. The **vers** option to
+    mount command is used for this purpose:
+
+    `$ mount -o vers=3 `
+
+showmount fails with clnt\_create: RPC: Unable to receive
+---------------------------------------------------------
+
+Check your firewall setting to open port 111 for portmap
+requests/replies and Gluster NFS server requests/replies. Gluster NFS
+server operates over the following port numbers: 38465, 38466, and
+38467.
+
+For more information, see ?.
+
+Application fails with "Invalid argument" or "Value too large for defined data type" error.
+-------------------------------------------------------------------------------------------
+
+These two errors generally happen for 32-bit nfs clients or applications
+that do not support 64-bit inode numbers or large files.
Use the +following option from the CLI to make Gluster NFS return 32-bit inode +numbers instead: nfs.enable-ino32 \<on|off\> + +Applications that will benefit are those that were either: + +- built 32-bit and run on 32-bit machines such that they do not + support large files by default + +- built 32-bit on 64-bit systems + +This option is disabled by default so NFS returns 64-bit inode numbers +by default. + +Applications which can be rebuilt from source are recommended to rebuild +using the following flag with gcc: + +` -D_FILE_OFFSET_BITS=64` + +Troubleshooting File Locks +========================== + +In GlusterFS 3.3 you can use `statedump` command to list the locks held +on files. The statedump output also provides information on each lock +with its range, basename, PID of the application holding the lock, and +so on. You can analyze the output to know about the locks whose +owner/application is no longer running or interested in that lock. After +ensuring that the no application is using the file, you can clear the +lock using the following `clear lock` command: + +`# ` + +For more information on performing `statedump`, see ? + +**To identify locked file and clear locks** + +1. Perform statedump on the volume to view the files that are locked + using the following command: + + `# gluster volume statedump inode` + + For example, to display statedump of test-volume: + + # gluster volume statedump test-volume + Volume statedump successful + + The statedump files are created on the brick servers in the` /tmp` + directory or in the directory set using `server.statedump-path` + volume option. The naming convention of the dump file is + `<brick-path>.<brick-pid>.dump`. + + The following are the sample contents of the statedump file. It + indicates that GlusterFS has entered into a state where there is an + entry lock (entrylk) and an inode lock (inodelk). Ensure that those + are stale locks and no resources own them. 
+ + [xlator.features.locks.vol-locks.inode] + path=/ + mandatory=0 + entrylk-count=1 + lock-dump.domain.domain=vol-replicate-0 + xlator.feature.locks.lock-dump.domain.entrylk.entrylk[0](ACTIVE)=type=ENTRYLK_WRLCK on basename=file1, pid = 714782904, owner=ffffff2a3c7f0000, transport=0x20e0670, , granted at Mon Feb 27 16:01:01 2012 + + conn.2.bound_xl./gfs/brick1.hashsize=14057 + conn.2.bound_xl./gfs/brick1.name=/gfs/brick1/inode + conn.2.bound_xl./gfs/brick1.lru_limit=16384 + conn.2.bound_xl./gfs/brick1.active_size=2 + conn.2.bound_xl./gfs/brick1.lru_size=0 + conn.2.bound_xl./gfs/brick1.purge_size=0 + + [conn.2.bound_xl./gfs/brick1.active.1] + gfid=538a3d4a-01b0-4d03-9dc9-843cd8704d07 + nlookup=1 + ref=2 + ia_type=1 + [xlator.features.locks.vol-locks.inode] + path=/file1 + mandatory=0 + inodelk-count=1 + lock-dump.domain.domain=vol-replicate-0 + inodelk.inodelk[0](ACTIVE)=type=WRITE, whence=0, start=0, len=0, pid = 714787072, owner=00ffff2a3c7f0000, transport=0x20e0670, , granted at Mon Feb 27 16:01:01 2012 + +2. Clear the lock using the following command: + + `# ` + + For example, to clear the entry lock on `file1` of test-volume: + + # gluster volume clear-locks test-volume / kind granted entry file1 + Volume clear-locks successful + vol-locks: entry blocked locks=0 granted locks=1 + +3. Clear the inode lock using the following command: + + `# ` + + For example, to clear the inode lock on `file1` of test-volume: + + # gluster volume clear-locks test-volume /file1 kind granted inode 0,0-0 + Volume clear-locks successful + vol-locks: inode blocked locks=0 granted locks=1 + + You can perform statedump on test-volume again to verify that the + above inode and entry locks are cleared. 
+ + diff --git a/doc/admin-guide/en-US/markdown/gfs_introduction.md b/doc/admin-guide/en-US/markdown/gfs_introduction.md new file mode 100644 index 000000000..fd2c53dc9 --- /dev/null +++ b/doc/admin-guide/en-US/markdown/gfs_introduction.md @@ -0,0 +1,50 @@ +Introducing Gluster File System +=============================== + +GlusterFS is an open source, clustered file system capable of scaling to +several petabytes and handling thousands of clients. GlusterFS can be +flexibly combined with commodity physical, virtual, and cloud resources +to deliver highly available and performant enterprise storage at a +fraction of the cost of traditional solutions. + +GlusterFS clusters together storage building blocks over Infiniband RDMA +and/or TCP/IP interconnect, aggregating disk and memory resources and +managing data in a single global namespace. GlusterFS is based on a +stackable user space design, delivering exceptional performance for +diverse workloads. + +![ Virtualized Cloud Environments ][] + +GlusterFS is designed for today's high-performance, virtualized cloud +environments. Unlike traditional data centers, cloud environments +require multi-tenancy along with the ability to grow or shrink resources +on demand. Enterprises can scale capacity, performance, and availability +on demand, with no vendor lock-in, across on-premise, public cloud, and +hybrid environments. + +GlusterFS is in production at thousands of enterprises spanning media, +healthcare, government, education, web 2.0, and financial services. 
The +following table lists the commercial offerings and its documentation +location: + + ------------------------------------------------------------------------ + Product Documentation Location + ----------- ------------------------------------------------------------ + Red Hat [][] + Storage + Software + Appliance + + Red Hat [][1] + Virtual + Storage + Appliance + + Red Hat [][2] + Storage + ------------------------------------------------------------------------ + + [ Virtualized Cloud Environments ]: images/640px-GlusterFS_Architecture.png + []: http://docs.redhat.com/docs/en-US/Red_Hat_Storage_Software_Appliance/index.html + [1]: http://docs.redhat.com/docs/en-US/Red_Hat_Virtual_Storage_Appliance/index.html + [2]: http://docs.redhat.com/docs/en-US/Red_Hat_Storage/index.html diff --git a/doc/admin-guide/en-US/markdown/glossary.md b/doc/admin-guide/en-US/markdown/glossary.md new file mode 100644 index 000000000..0febaff8f --- /dev/null +++ b/doc/admin-guide/en-US/markdown/glossary.md @@ -0,0 +1,134 @@ +Glossary +======== + +Brick +: A Brick is the GlusterFS basic unit of storage, represented by an + export directory on a server in the trusted storage pool. A Brick is + expressed by combining a server with an export directory in the + following format: + + `SERVER:EXPORT` + + For example: + + `myhostname:/exports/myexportdir/` + +Cluster +: A cluster is a group of linked computers, working together closely + thus in many respects forming a single computer. + +Distributed File System +: A file system that allows multiple clients to concurrently access + data over a computer network. + +Filesystem +: A method of storing and organizing computer files and their data. + Essentially, it organizes these files into a database for the + storage, organization, manipulation, and retrieval by the computer's + operating system. 
+ + Source: [Wikipedia][] + +FUSE +: Filesystem in Userspace (FUSE) is a loadable kernel module for + Unix-like computer operating systems that lets non-privileged users + create their own file systems without editing kernel code. This is + achieved by running file system code in user space while the FUSE + module provides only a "bridge" to the actual kernel interfaces. + + Source: [Wikipedia][1] + +Geo-Replication +: Geo-replication provides a continuous, asynchronous, and incremental + replication service from site to another over Local Area Networks + (LAN), Wide Area Network (WAN), and across the Internet. + +glusterd +: The Gluster management daemon that needs to run on all servers in + the trusted storage pool. + +Metadata +: Metadata is data providing information about one or more other + pieces of data. + +Namespace +: Namespace is an abstract container or environment created to hold a + logical grouping of unique identifiers or symbols. Each Gluster + volume exposes a single namespace as a POSIX mount point that + contains every file in the cluster. + +Open Source +: Open source describes practices in production and development that + promote access to the end product's source materials. Some consider + open source a philosophy, others consider it a pragmatic + methodology. + + Before the term open source became widely adopted, developers and + producers used a variety of phrases to describe the concept; open + source gained hold with the rise of the Internet, and the attendant + need for massive retooling of the computing source code. + + Opening the source code enabled a self-enhancing diversity of + production models, communication paths, and interactive communities. + Subsequently, a new, three-word phrase "open source software" was + born to describe the environment that the new copyright, licensing, + domain, and consumer issues created. 
+ + Source: [Wikipedia][2] + +Petabyte +: A petabyte (derived from the SI prefix peta- ) is a unit of + information equal to one quadrillion (short scale) bytes, or 1000 + terabytes. The unit symbol for the petabyte is PB. The prefix peta- + (P) indicates a power of 1000: + + 1 PB = 1,000,000,000,000,000 B = 1000^5 B = 10^15 B. + + The term "pebibyte" (PiB), using a binary prefix, is used for the + corresponding power of 1024. + + Source: [Wikipedia][3] + +POSIX +: Portable Operating System Interface (for Unix) is the name of a + family of related standards specified by the IEEE to define the + application programming interface (API), along with shell and + utilities interfaces for software compatible with variants of the + Unix operating system. Gluster exports a fully POSIX compliant file + system. + +RAID +: Redundant Array of Inexpensive Disks (RAID) is a technology that + provides increased storage reliability through redundancy, combining + multiple low-cost, less-reliable disk drives components into a + logical unit where all drives in the array are interdependent. + +RRDNS +: Round Robin Domain Name Service (RRDNS) is a method to distribute + load across application servers. RRDNS is implemented by creating + multiple A records with the same name and different IP addresses in + the zone file of a DNS server. + +Trusted Storage Pool +: A storage pool is a trusted network of storage servers. When you + start the first server, the storage pool consists of that server + alone. + +Userspace +: Applications running in user space don’t directly interact with + hardware, instead using the kernel to moderate access. Userspace + applications are generally more portable than applications in kernel + space. Gluster is a user space application. + +Volfile +: Volfile is a configuration file used by glusterfs process. Volfile + will be usually located at `/var/lib/glusterd/vols/VOLNAME`. + +Volume +: A volume is a logical collection of bricks. 
Most of the gluster + management operations happen on the volume. + + [Wikipedia]: http://en.wikipedia.org/wiki/Filesystem + [1]: http://en.wikipedia.org/wiki/Filesystem_in_Userspace + [2]: http://en.wikipedia.org/wiki/Open_source + [3]: http://en.wikipedia.org/wiki/Petabyte diff --git a/doc/authentication.txt b/doc/authentication.txt index 70aafd933..73cb21d73 100644 --- a/doc/authentication.txt +++ b/doc/authentication.txt @@ -48,7 +48,7 @@ protocol/client: option remote-subvolume foo-brick * Client is connecting from a.b.c.d - + protocol/server: option auth.addr.foo-brick.allow a.b.c.d,e.f.g.h,i.j.k.l #, other ip addresses from which clients are allowed to connect to foo-brick @@ -79,19 +79,19 @@ * reject only "user shoo from a.b.c.d" protcol/client: option remote-subvolume shoo-brick - + protocol/server: # observe that no "option auth.login.shoo-brick.allow shoo" given # Also other users from a.b.c.d have to be explicitly allowed using auth.login.shoo-brick.allow ... option auth.addr.shoo-brick.allow !a.b.c.d - * reject only "user shoo" from a.b.c.d i.e., user shoo from a.b.c.d has to be rejected. + * reject only "user shoo" from a.b.c.d i.e., user shoo from a.b.c.d has to be rejected. 
* same as reject only "user shoo from a.b.c.d" above, but rules have to be added whether to allow ip addresses (and users from those ips) other than a.b.c.d **************************************************************************************************** * ip or username/password based authentication - + * allow user foo or clients from a.b.c.d protocol/client: option remote-subvolume foo-brick @@ -104,7 +104,7 @@ * reject user shoo or clients from a.b.c.d protocol/client: option remote-subvolume shoo-brick - + protocol/server: option auth.login.shoo-brick.allow <usernames other than shoo> #for each username mentioned in the above <usernames other than shoo> list, specify password as below diff --git a/doc/coding-standard.tex b/doc/coding-standard.tex index 92f799c01..30d412a91 100644 --- a/doc/coding-standard.tex +++ b/doc/coding-standard.tex @@ -27,7 +27,7 @@ purpose. The comment should be descriptive without being overly verbose. \textsl{Good}: \begin{verbatim} - DBTYPE access_mode; /* access mode for accessing + DBTYPE access_mode; /* access mode for accessing * the databases, can be * DB_HASH, DB_BTREE * (option access-mode <mode>) @@ -69,7 +69,7 @@ Never use a non-constant expression as the initialization value for a variable. \section*{$\bullet$ Validate all arguments to a function} All pointer arguments to a function must be checked for \texttt{NULL}. -A macro named \texttt{VALIDATE} (in \texttt{common-utils.h}) +A macro named \texttt{VALIDATE} (in \texttt{common-utils.h}) takes one argument, and if it is \texttt{NULL}, writes a log message and jumps to a label called \texttt{err} after setting op\_ret and op\_errno appropriately. It is recommended to use this template. @@ -142,8 +142,8 @@ for success or failure. 
\begin{verbatim} op_ret = close (_fd); if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "close on file %s failed (%s)", real_path, + gf_log (this->name, GF_LOG_ERROR, + "close on file %s failed (%s)", real_path, strerror (errno)); op_errno = errno; goto out; @@ -157,9 +157,9 @@ memory allocation fails, the call should be unwound and an error returned to the user. \section*{$\bullet$ Use result args and reserve the return value to indicate success or failure} -The return value of every functions must indicate success or failure (unless -it is impossible for the function to fail --- e.g., boolean functions). If -the function needs to return additional data, it must be returned using a +The return value of every functions must indicate success or failure (unless +it is impossible for the function to fail --- e.g., boolean functions). If +the function needs to return additional data, it must be returned using a result (pointer) argument. \vspace{2ex} @@ -192,11 +192,11 @@ Unless impossible, use the length-limited versions of the string functions. \end{verbatim} \section*{$\bullet$ No dead or commented code} -There must be no dead code (code to which control can never be passed) or +There must be no dead code (code to which control can never be passed) or commented out code in the codebase. \section*{$\bullet$ Only one unwind and return per function} -There must be only one exit out of a function. \texttt{UNWIND} and return +There must be only one exit out of a function. \texttt{UNWIND} and return should happen at only point in the function. \section*{$\bullet$ Function length or Keep functions small} @@ -305,7 +305,7 @@ documentation. \end{verbatim} \subsection*{Indicating critical sections} -To clearly show regions of code which execute with locks held, use +To clearly show regions of code which execute with locks held, use the following format: \begin{verbatim} @@ -324,7 +324,7 @@ point, \texttt{out}. 
At that point, the code should detect the error that has occured and do appropriate cleanup. \begin{verbatim} -int32_t +int32_t sample_fop (call_frame_t *frame, xlator_t *this, ...) { char * var1 = NULL; @@ -337,13 +337,13 @@ sample_fop (call_frame_t *frame, xlator_t *this, ...) VALIDATE_OR_GOTO (this, out); /* other validations */ - + dir = opendir (...); if (dir == NULL) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s (%s)", loc->path, + gf_log (this->name, GF_LOG_ERROR, + "opendir failed on %s (%s)", loc->path, strerror (op_errno)); goto out; } @@ -367,11 +367,10 @@ sample_fop (call_frame_t *frame, xlator_t *this, ...) if (dir) { closedir (dir); dir = NULL; - } - + } + if (pfd) { - if (pfd->path) - FREE (pfd->path); + FREE (pfd->path); FREE (pfd); pfd = NULL; } diff --git a/doc/examples/Makefile.am b/doc/examples/Makefile.am deleted file mode 100644 index b4c93f4c9..000000000 --- a/doc/examples/Makefile.am +++ /dev/null @@ -1,8 +0,0 @@ -EXTRA = README unify.vol replicate.vol stripe.vol protocol-client.vol protocol-server.vol posix-locks.vol trash.vol write-behind.vol io-threads.vol io-cache.vol read-ahead.vol filter.vol trace.vol -EXTRA_DIST = $(EXTRA) - -docdir = $(datadir)/doc/$(PACKAGE_NAME) -Examplesdir = $(docdir)/examples -Examples_DATA = $(EXTRA) - -CLEANFILES = diff --git a/doc/examples/README b/doc/examples/README deleted file mode 100644 index 4d472ac08..000000000 --- a/doc/examples/README +++ /dev/null @@ -1,13 +0,0 @@ -GlusterFS's translator feature is very flexible and there are quite a lot of ways one -can configure their filesystem to behave like. - -Volume Specification is a way in which GlusterFS understands how it has to work, based -on what is written there. - -Going through the following URLs may give you more idea about all these. 
- -* http://www.gluster.org/docs/index.php/GlusterFS -* http://www.gluster.org/docs/index.php/GlusterFS_Volume_Specification -* http://www.gluster.org/docs/index.php/GlusterFS_Translators - -Mail us any doubts, suggestions on 'gluster-devel(at)nongnu.org' diff --git a/doc/examples/filter.vol b/doc/examples/filter.vol deleted file mode 100644 index ca5c59837..000000000 --- a/doc/examples/filter.vol +++ /dev/null @@ -1,23 +0,0 @@ -volume client - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 192.168.1.10 # IP address of the remote brick - option remote-subvolume brick # name of the remote volume -end-volume - -## In normal clustered storage type, any of the cluster translators can come here. -# -# Definition of other clients -# -# Definition of cluster translator (may be unify, afr, or unify over afr) -# - -### 'Filter' translator is used on client side (or server side according to needs). This traslator makes all the below translators, (or say volumes) as read-only. Hence if one wants a 'read-only' filesystem, using filter as the top most volume will make it really fast as the fops are returned from this level itself. - -volume filter-ro - type features/filter - option root-squashing enable -# option completely-read-only yes -# translate-uid 1-99=0 - subvolumes client -end-volume
\ No newline at end of file diff --git a/doc/examples/io-cache.vol b/doc/examples/io-cache.vol deleted file mode 100644 index 5f3eca4c5..000000000 --- a/doc/examples/io-cache.vol +++ /dev/null @@ -1,25 +0,0 @@ -volume client - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 192.168.1.10 # IP address of the remote brick - option remote-subvolume brick # name of the remote volume -end-volume - -## In normal clustered storage type, any of the cluster translators can come here. -# -# Definition of other clients -# -# Definition of cluster translator (may be unify, replicate, or unify over replicate) -# - -### 'IO-Cache' translator is best used on client side when a filesystem has file which are not modified frequently but read several times. For example, while compiling a kernel, *.h files are read while compiling every *.c file, in these case, io-cache translator comes very handy, as it keeps the whole file content in the cache, and serves from the cache. -# One can provide the priority of the cache too. - -volume ioc - type performance/io-cache - subvolumes client # In this example it is 'client' you may have to change it according to your spec file. - option page-size 1MB # 128KB is default - option cache-size 64MB # 32MB is default - option force-revalidate-timeout 5 # 1second is default - option priority *.html:2,*:1 # default is *:0 -end-volume diff --git a/doc/examples/io-threads.vol b/doc/examples/io-threads.vol deleted file mode 100644 index 9954724e1..000000000 --- a/doc/examples/io-threads.vol +++ /dev/null @@ -1,21 +0,0 @@ - -volume brick - type storage/posix # POSIX FS translator - option directory /home/export # Export this directory -end-volume - -### 'IO-threads' translator gives a threading behaviour to File I/O calls. All other normal fops are having default behaviour. Loading this on server side helps to reduce the contension of network. (Which is assumed as a GlusterFS hang). 
-# One can load it in client side to reduce the latency involved in case of a slow network, when loaded below write-behind. -volume iot - type performance/io-threads - subvolumes brick - option thread-count 4 # default value is 1 -end-volume - -volume server - type protocol/server - subvolumes iot brick - option transport-type tcp # For TCP/IP transport - option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume - option auth.addr.iot.allow 192.168.* # Allow access to "p-locks" volume -end-volume diff --git a/doc/examples/posix-locks.vol b/doc/examples/posix-locks.vol deleted file mode 100644 index b9c9e7a64..000000000 --- a/doc/examples/posix-locks.vol +++ /dev/null @@ -1,20 +0,0 @@ - -volume brick - type storage/posix # POSIX FS translator - option directory /home/export # Export this directory -end-volume - -### 'Posix-locks' feature should be added on the server side (as posix volume as subvolume) because it contains the actual file. -volume p-locks - type features/posix-locks - subvolumes brick - option mandatory on -end-volume - -volume server - type protocol/server - subvolumes p-locks brick - option transport-type tcp - option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume - option auth.addr.p-locks.allow 192.168.* # Allow access to "p-locks" volume -end-volume diff --git a/doc/examples/protocol-client.vol b/doc/examples/protocol-client.vol deleted file mode 100644 index 43c43e02d..000000000 --- a/doc/examples/protocol-client.vol +++ /dev/null @@ -1,17 +0,0 @@ -volume client - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport - option remote-host 192.168.1.10 # IP address of the remote brick -# option transport.socket.remote-port 24016 - -# option transport-type ib-verbs # for Infiniband verbs transport -# option transport.ib-verbs.work-request-send-size 1048576 -# option transport.ib-verbs.work-request-send-count 16 -# option 
transport.ib-verbs.work-request-recv-size 1048576 -# option transport.ib-verbs.work-request-recv-count 16 -# option transport.ib-verbs.remote-port 24016 - - option remote-subvolume brick # name of the remote volume -# option transport-timeout 30 # default value is 120seconds -end-volume diff --git a/doc/examples/protocol-server.vol b/doc/examples/protocol-server.vol deleted file mode 100644 index e8e4a4643..000000000 --- a/doc/examples/protocol-server.vol +++ /dev/null @@ -1,25 +0,0 @@ - -### Export volume "brick" with the contents of "/home/export" directory. -volume brick - type storage/posix # POSIX FS translator - option directory /home/export # Export this directory -end-volume - -### Add network serving capability to above brick. -volume server - type protocol/server - option transport-type tcp # For TCP/IP transport -# option transport.socket.listen-port 24016 - -# option transport-type ib-verbs # For Infiniband Verbs transport -# option transport.ib-verbs.work-request-send-size 131072 -# option transport.ib-verbs.work-request-send-count 64 -# option transport.ib-verbs.work-request-recv-size 131072 -# option transport.ib-verbs.work-request-recv-count 64 -# option transport.ib-verbs.listen-port 24016 - -# option bind-address 192.168.1.10 # Default is to listen on all interfaces -# option client-volume-filename /etc/glusterfs/glusterfs-client.vol - subvolumes brick - option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume -end-volume diff --git a/doc/examples/read-ahead.vol b/doc/examples/read-ahead.vol deleted file mode 100644 index 3ce0d95ac..000000000 --- a/doc/examples/read-ahead.vol +++ /dev/null @@ -1,22 +0,0 @@ -volume client - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 192.168.1.10 # IP address of the remote brick - option remote-subvolume brick # name of the remote volume -end-volume - -## In normal clustered storage type, any of the cluster translators can come here. 
-# -# Definition of other clients -# -# Definition of cluster translator (may be unify, replicate, or unify over replicate) -# - -### 'Read-Ahead' translator is best utilized on client side, as it prefetches the file contents when the first read() call is issued. -volume ra - type performance/read-ahead - subvolumes client # In this example it is 'client' you may have to change it according to your spec file. - option page-size 1MB # default is 256KB - option page-count 4 # default is 2 - option force-atime-update no # defalut is 'no' -end-volume diff --git a/doc/examples/replicate.vol b/doc/examples/replicate.vol deleted file mode 100644 index 333ba7de1..000000000 --- a/doc/examples/replicate.vol +++ /dev/null @@ -1,119 +0,0 @@ -### 'NOTE' -# This file has both server spec and client spec to get an understanding of stripe's spec file. Hence can't be used as it is, as a GlusterFS spec file. -# One need to seperate out server spec and client spec to get it working. - -#========================================================================= - -# **** server1 spec file **** - -### Export volume "brick" with the contents of "/home/export" directory. -volume posix1 - type storage/posix # POSIX FS translator - option directory /home/export1 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick1 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix1 -end-volume - -### Add network serving capability to above brick. 
-volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24016 -# option client-volume-filename /etc/glusterfs/glusterfs-client.vol - subvolumes brick1 - option auth.addr.brick1.allow * # access to "brick" volume -end-volume - - -#========================================================================= - -# **** server2 spec file **** -volume posix2 - type storage/posix # POSIX FS translator - option directory /home/export2 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick2 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix2 -end-volume - -### Add network serving capability to above brick. -volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24017 - subvolumes brick2 - option auth.addr.brick2.allow * # Allow access to "brick" volume -end-volume - - -#========================================================================= - -# **** server3 spec file **** - -volume posix3 - type storage/posix # POSIX FS translator - option directory /home/export3 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick3 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix3 -end-volume - -### Add network serving capability to above brick. 
-volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24018 - subvolumes brick3 - option auth.addr.brick3.allow * # access to "brick" volume -end-volume - - -#========================================================================= - -# **** Clustered Client config file **** - -### Add client feature and attach to remote subvolume of server1 -volume client1 - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24016 - option remote-subvolume brick1 # name of the remote volume -end-volume - -### Add client feature and attach to remote subvolume of server2 -volume client2 - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24017 - option remote-subvolume brick2 # name of the remote volume -end-volume - -volume client3 - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24018 - option remote-subvolume brick3 # name of the remote volume -end-volume - -## Add replicate feature. -volume replicate - type cluster/replicate - subvolumes client1 client2 client3 -end-volume - diff --git a/doc/examples/stripe.vol b/doc/examples/stripe.vol deleted file mode 100644 index 6055b66b9..000000000 --- a/doc/examples/stripe.vol +++ /dev/null @@ -1,121 +0,0 @@ - -### 'NOTE' -# This file has both server spec and client spec to get an understanding of stripe's spec file. Hence can't be used as it is, as a GlusterFS spec file. -# One need to seperate out server spec and client spec to get it working. 
- -#========================================================================= - -# **** server1 spec file **** - -### Export volume "brick" with the contents of "/home/export" directory. -volume posix1 - type storage/posix # POSIX FS translator - option directory /home/export1 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick1 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix1 -end-volume - -### Add network serving capability to above brick. -volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24016 -# option client-volume-filename /etc/glusterfs/glusterfs-client.vol - subvolumes brick1 - option auth.addr.brick1.allow * # access to "brick" volume -end-volume - - -#========================================================================= - -# **** server2 spec file **** -volume posix2 - type storage/posix # POSIX FS translator - option directory /home/export2 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick2 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix2 -end-volume - -### Add network serving capability to above brick. 
-volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24017 - subvolumes brick2 - option auth.addr.brick2.allow * # Allow access to "brick" volume -end-volume - - -#========================================================================= - -# **** server3 spec file **** - -volume posix3 - type storage/posix # POSIX FS translator - option directory /home/export3 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick3 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix3 -end-volume - -### Add network serving capability to above brick. -volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24018 - subvolumes brick3 - option auth.addr.brick3.allow * # access to "brick" volume -end-volume - - -#========================================================================= - -# **** Clustered Client config file **** - -### Add client feature and attach to remote subvolume of server1 -volume client1 - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24016 - option remote-subvolume brick1 # name of the remote volume -end-volume - -### Add client feature and attach to remote subvolume of server2 -volume client2 - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24017 - option remote-subvolume brick2 # name of the remote volume -end-volume - -volume client3 - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24018 - option remote-subvolume brick3 # name of 
the remote volume -end-volume - -## Add Stripe Feature. -volume stripe - type cluster/stripe - subvolumes client1 client2 client3 - option block-size 1MB -end-volume - diff --git a/doc/examples/trace.vol b/doc/examples/trace.vol deleted file mode 100644 index 3f4864db4..000000000 --- a/doc/examples/trace.vol +++ /dev/null @@ -1,16 +0,0 @@ -volume client - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 192.168.1.10 # IP address of the remote brick - option remote-subvolume brick # name of the remote volume -end-volume - -### 'Trace' translator is a very handy debug tool for GlusterFS, as it can be loaded between any of the two volumes without changing the behaviour of the filesystem. -# On client side it can be the top most volume in spec (like now) to understand what calls are made on FUSE filesystem, when a mounted filesystem is accessed. - -volume trace - type debug/trace - subvolumes client -end-volume - -# 'NOTE:' By loading 'debug/trace' translator, filesystem will be very slow as it logs each and every calls to the log file. diff --git a/doc/examples/trash.vol b/doc/examples/trash.vol deleted file mode 100644 index 16e71be32..000000000 --- a/doc/examples/trash.vol +++ /dev/null @@ -1,20 +0,0 @@ - -volume brick - type storage/posix # POSIX FS translator - option directory /home/export # Export this directory -end-volume - -### 'Trash' translator is best used on server side as it just renames the deleted file inside 'trash-dir', and it makes 4 seperate fops for one unlink call. 
-volume trashcan - type features/trash - subvolumes brick - option trash-dir /.trashcan -end-volume - -volume server - type protocol/server - subvolumes trashcan brick - option transport-type tcp # For TCP/IP transport - option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume - option auth.addr.trashcan.allow 192.168.* # Allow access to "p-locks" volume -end-volume diff --git a/doc/examples/unify.vol b/doc/examples/unify.vol deleted file mode 100644 index 3fb7e8320..000000000 --- a/doc/examples/unify.vol +++ /dev/null @@ -1,178 +0,0 @@ -### 'NOTE' -# This file has both server spec and client spec to get an understanding of stripe's spec file. Hence can't be used as it is, as a GlusterFS spec file. -# One need to seperate out server spec and client spec to get it working. - - -#========================================================================= - -# **** server1 spec file **** - -### Export volume "brick" with the contents of "/home/export" directory. -volume posix1 - type storage/posix # POSIX FS translator - option directory /home/export1 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick1 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix1 -end-volume - -### Add network serving capability to above brick. 
-volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24016 -# option client-volume-filename /etc/glusterfs/glusterfs-client.vol - subvolumes brick1 - option auth.addr.brick1.allow * # access to "brick" volume -end-volume - - -#========================================================================= - -# **** server2 spec file **** -volume posix2 - type storage/posix # POSIX FS translator - option directory /home/export2 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick2 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix2 -end-volume - -### Add network serving capability to above brick. -volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24017 - subvolumes brick2 - option auth.addr.brick2.allow * # Allow access to "brick" volume -end-volume - - -#========================================================================= - -# **** server3 spec file **** - -volume posix3 - type storage/posix # POSIX FS translator - option directory /home/export3 # Export this directory -end-volume - -### Add POSIX record locking support to the storage brick -volume brick3 - type features/posix-locks - option mandatory on # enables mandatory locking on all files - subvolumes posix3 -end-volume - -### Add network serving capability to above brick. -volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24018 - subvolumes brick3 - option auth.addr.brick3.allow * # access to "brick" volume -end-volume - -#========================================================================= - -# *** server for namespace *** -### Export volume "brick" with the contents of "/home/export" directory. 
-volume brick-ns - type storage/posix # POSIX FS translator - option directory /home/export-ns # Export this directory -end-volume - -volume server - type protocol/server - option transport-type tcp # For TCP/IP transport - option transport.socket.listen-port 24019 - subvolumes brick-ns - option auth.addr.brick-ns.allow * # access to "brick" volume -end-volume - - -#========================================================================= - -# **** Clustered Client config file **** - -### Add client feature and attach to remote subvolume of server1 -volume client1 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24016 - option remote-subvolume brick1 # name of the remote volume -end-volume - -### Add client feature and attach to remote subvolume of server2 -volume client2 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24017 - option remote-subvolume brick2 # name of the remote volume -end-volume - -volume client3 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24018 - option remote-subvolume brick3 # name of the remote volume -end-volume - - -volume client-ns - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport - option remote-host 127.0.0.1 # IP address of the remote brick - option transport.socket.remote-port 24019 - option remote-subvolume brick-ns # name of the remote volume -end-volume - -### Add unify feature to cluster the 
servers. Associate an -### appropriate scheduler that matches your I/O demand. -volume bricks - type cluster/unify - option namespace client-ns # this will not be storage child of unify. - subvolumes client1 client2 client3 -### ** ALU Scheduler Option ** - option self-heal background # foreground off # default is foreground - option scheduler alu - option alu.limits.min-free-disk 5% #% - option alu.limits.max-open-files 10000 - option alu.order disk-usage:read-usage:write-usage:open-files-usage:disk-speed-usage - option alu.disk-usage.entry-threshold 2GB - option alu.disk-usage.exit-threshold 128MB - option alu.open-files-usage.entry-threshold 1024 - option alu.open-files-usage.exit-threshold 32 - option alu.read-usage.entry-threshold 20 #% - option alu.read-usage.exit-threshold 4 #% - option alu.write-usage.entry-threshold 20 #% - option alu.write-usage.exit-threshold 4 #% - option alu.disk-speed-usage.entry-threshold 0 # DO NOT SET IT. SPEED IS CONSTANT!!!. - option alu.disk-speed-usage.exit-threshold 0 # DO NOT SET IT. SPEED IS CONSTANT!!!. - option alu.stat-refresh.interval 10sec - option alu.stat-refresh.num-file-create 10 -### ** Random Scheduler ** -# option scheduler random -### ** NUFA Scheduler ** -# option scheduler nufa -# option nufa.local-volume-name posix1 -### ** Round Robin (RR) Scheduler ** -# option scheduler rr -# option rr.limits.min-free-disk 5% #% -end-volume - diff --git a/doc/examples/write-behind.vol b/doc/examples/write-behind.vol deleted file mode 100644 index 9c6bae11c..000000000 --- a/doc/examples/write-behind.vol +++ /dev/null @@ -1,26 +0,0 @@ -volume client - type protocol/client - option transport-type tcp # for TCP/IP transport - option remote-host 192.168.1.10 # IP address of the remote brick - option remote-subvolume brick # name of the remote volume -end-volume - -## In normal clustered storage type, any of the cluster translators can come here. 
-# -# Definition of other clients -# -# Definition of cluster translator (may be unify, replicate, or unify over replicate) -# - - -### 'Write-behind' translator is a performance booster for write operation. Best used on client side, as its main intension is to reduce the network latency caused for each write operation. - -volume wb - type performance/write-behind - subvolumes client # In this example it is 'client' you may have to change it according to your spec file. - option flush-behind on # default value is 'off' - option window-size 2MB - option aggregate-size 1MB # default value is 0 - option enable_O_SYNC no # default is no - option disable-for-first-nbytes 128KB #default is 1 -end-volume diff --git a/doc/features/rdma-cm-in-3.4.0.txt b/doc/features/rdma-cm-in-3.4.0.txt new file mode 100644 index 000000000..fd953e56b --- /dev/null +++ b/doc/features/rdma-cm-in-3.4.0.txt @@ -0,0 +1,9 @@ +Following is the impact of http://review.gluster.org/#change,149. + +New userspace packages needed: +librdmacm +librdmacm-devel + +rdmacm needs an IPoIB address for connection establishment. This requirement results in following issues: +* Because of bug #890502, we've to probe the peer on an IPoIB address. This imposes a restriction that all volumes created in the future have to communicate over IPoIB address (irrespective of whether they use gluster's tcp or rdma transport). +* Currently client has an independence to choose b/w tcp and rdma transports while communicating with the server (by creating volumes with transport-type tcp,rdma). This independence was a byproduct of our ability use the normal channel used with transport-type tcp for rdma connectiion establishment handshake too. 
However, with new requirement of IPoIB address for connection establishment, we lose this independence (till we bring in multi-network support - where a brick can be identified by a set of ip-addresses and we can choose different pairs of ip-addresses for communication based on our requirements - in glusterd). diff --git a/doc/features/rebalance.md b/doc/features/rebalance.md new file mode 100644 index 000000000..29b993008 --- /dev/null +++ b/doc/features/rebalance.md @@ -0,0 +1,74 @@ +## Background + + +For a more detailed description, view Jeff Darcy's blog post [here] +(http://hekafs.org/index.php/2012/03/glusterfs-algorithms-distribution/) + +GlusterFS uses the distribute translator (DHT) to aggregate space of multiple servers. DHT distributes files among its subvolumes using a consistent hashing method providing 32-bit hashes. Each DHT subvolume is given a range in the 32-bit hash space. A hash value is calculated for every file using a combination of its name. The file is then placed in the subvolume with the hash range that contains the hash value. + +## What is rebalance? + +The rebalance process migrates files between the DHT subvolumes when necessary. + +## When is rebalance required? + +Rebalancing is required for two main cases. + +1. Addition/Removal of bricks + +2. Renaming of a file + +## Addition/Removal of bricks + +Whenever the number or order of DHT subvolumes change, the hash range given to each subvolume is recalculated. When this happens, already existing files on the volume will need to be moved to the correct subvolume based on their hash. Rebalance does this activity. + +Addition of bricks which increase the size of a volume will increase the number of DHT subvolumes and lead to recalculation of hash ranges (This doesn't happen when bricks are added to a volume to increase redundancy, i.e. increase replica count of a volume). This will require an explicit rebalance command to be issued to migrate the files. 
+ +Removal of bricks which decrease the size of a volume also causes the hash ranges of DHT to be recalculated. But we don't need to issue an explicit rebalance command in this case, as rebalance is done automatically by the remove-brick process if needed. + +## Renaming of a file + +Renaming of a file will cause its hash to change. The file now needs to be moved to the correct subvolume based on its new hash. Rebalance does this. + +## How does rebalance work? + +At a high level, the rebalance process consists of the following 3 steps: + +1. Crawl the volume to access all files +2. Calculate the hash for the file +3. If needed, migrate the file to the correct subvolume. + + +The rebalance process has been optimized by making it distributed across the trusted storage pool. With distributed rebalance, a rebalance process is launched on each peer in the cluster. Each rebalance process will crawl files on only those bricks of the volume which are present on it, and migrate the files which need migration to the correct brick. This speeds up the rebalance process considerably. + +## What will happen if rebalance is not run? + +### Addition of bricks + +With the current implementation of add-brick, when the size of a volume is augmented by adding new bricks, the new bricks are not put into use immediately i.e., the hash ranges are not recalculated immediately. This means that the files will still be placed only onto the existing bricks, leaving the newly added storage space unused. Starting a rebalance process on the volume will cause the hash ranges to be recalculated with the new bricks included, which allows the newly added storage space to be used. + +### Renaming a file + +When a file rename causes the file to be hashed to a new subvolume, DHT writes a link file on the new subvolume leaving the actual file on the original subvolume. 
A link file is an empty file, which has an extended attribute set that points to the subvolume on which the actual file exists. So, when a client accesses the renamed file, DHT first looks for the file in the hashed subvolume and gets the link file. DHT understands the link file, and gets the actual file from the subvolume pointed to by the link file. This leads to a slight reduction in performance. A rebalance will move the actual file to the hashed subvolume, allowing clients to access the file directly once again. + +## Are clients affected during a rebalance process? + +The rebalance process is transparent to applications on the clients. Applications which have open files on the volume will not be affected by the rebalance process, even if the open file requires migration. The DHT translator on the client will hide the migration from the applications. + +## How are open files migrated? + +(A more technical description of the algorithm used can be seen in the commit message of commit a07bb18c8adeb8597f62095c5d1361c5bad01f09.) + +To achieve migration of open files, two things need to be assured of, +a) any writes or changes happening to the file during migration are correctly synced to destination subvolume after the migration is complete. +b) any further changes should be made to the destination subvolume + +Both of these requirements require sending notifications to clients. Clients are notified by overloading an attribute used in every callback function. DHT understands these attributes in the callbacks and can be notified if a file is being migrated or not. + +During rebalance, a file will be in two phases + +1. Migration in process - In this phase the file is being migrated by the rebalance process from the source subvolume to the destination subvolume. The rebalance process will set an 'in-migration' attribute on the file, which will notify the clients' DHT translator. 
The clients' DHT translator will then take care to send any further changes to the destination subvolume as well. This way we satisfy the first requirement + +2. Migration completed - Once the file has been migrated, the rebalance process will set a 'migration-complete' attribute on the file. The clients will be notified of the completion and all further operations on the file will happen on the destination subvolume. + +The DHT translator handles the above and allows the applications on the clients to continue working on a file under migration. diff --git a/doc/gluster.8 b/doc/gluster.8 index 3886d29a0..3c78fb8b1 100644 --- a/doc/gluster.8 +++ b/doc/gluster.8 @@ -1,18 +1,11 @@ -.\" Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + +.\" Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> .\" This file is part of GlusterFS. .\" -.\" GlusterFS is GF_FREE software; you can redistribute it and/or modify -.\" it under the terms of the GNU Affero General Public License as published -.\" by the Free Software Foundation; either version 3 of the License, -.\" or (at your option) any later version. -.\" -.\" GlusterFS is distributed in the hope that it will be useful, but -.\" WITHOUT ANY WARRANTY; without even the implied warranty of -.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -.\" Affero General Public License for more details. -.\" -.\" You should have received a copy of the GNU Affero General Public License -.\" along with this program. If not, see " <http://www.gnu.org/licenses/>. +.\" This file is licensed to you under your choice of the GNU Lesser +.\" General Public License, version 3 or any later version (LGPLv3 or +.\" later), or the GNU General Public License, version 2 (GPLv2), in all +.\" cases as published by the Free Software Foundation. .\" .\" .TH Gluster 8 "Gluster command line utility" "07 March 2011" "Gluster Inc." 
@@ -43,8 +36,9 @@ The Gluster Console Manager is a command line utility for elastic volume managem \fB\ volume info [all|<VOLNAME>] \fR Display information about all volumes, or the specified volume. .TP -\fB\ volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] [transport <tcp|rdma>] <NEW-BRICK> ... \fR +\fB\ volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] [transport <tcp|rdma|tcp,rdma>] <NEW-BRICK> ... \fR Create a new volume of the specified type using the specified bricks and transport type (the default transport type is tcp). +To create a volume with both transports (tcp and rdma), give 'transport tcp,rdma' as an option. .TP \fB\ volume delete <VOLNAME> \fR Delete the specified volume. @@ -93,7 +87,7 @@ Replace the specified brick. \fB\ volume log filename <VOLNAME> [BRICK] <DIRECTORY> \fB Set the log directory for the corresponding volume/brick. .TP -\fB\volume log locate <VOLNAME> [BRICK] \fB +\fB\ volume log locate <VOLNAME> [BRICK] \fB Locate the log file for corresponding volume/brick. .TP \fB\ volume log rotate <VOLNAME> [BRICK] \fB @@ -120,7 +114,7 @@ Display the command options. Exit the gluster command line interface. .SH FILES -/etc/glusterd/* +/var/lib/glusterd/* .SH SEE ALSO .nf \fBfusermount\fR(1), \fBmount.glusterfs\fR(8), \fBglusterfs\fR(8), \fBglusterd\fR(8) diff --git a/doc/glusterd.8 b/doc/glusterd.8 index 43c1570f6..04a43481e 100644 --- a/doc/glusterd.8 +++ b/doc/glusterd.8 @@ -1,20 +1,11 @@ .\" -.\" Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> +.\" Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> .\" This file is part of GlusterFS. .\" -.\" GlusterFS is GF_FREE software; you can redistribute it and/or modify -.\" it under the terms of the GNU Affero General Public License as published -.\" by the Free Software Foundation; either version 3 of the License, -.\" or (at your option) any later version. 
-.\" -.\" GlusterFS is distributed in the hope that it will be useful, but -.\" WITHOUT ANY WARRANTY; without even the implied warranty of -.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -.\" Affero General Public License for more details. -.\" -.\" You should have received a copy of the GNU Affero General Public License -.\" along with this program. If not, see -.\" <http://www.gnu.org/licenses/>. +.\" This file is licensed to you under your choice of the GNU Lesser +.\" General Public License, version 3 or any later version (LGPLv3 or +.\" later), or the GNU General Public License, version 2 (GPLv2), in all +.\" cases as published by the Free Software Foundation. .\" .\" @@ -33,10 +24,10 @@ The glusterd daemon is used for elastic volume management. The daemon must be ru .PP .TP -\fB\-l=<LOGFILE>, \fB\-\-log\-file=<LOGFILE>\fR +\fB\-l <LOGFILE>, \fB\-\-log\-file=<LOGFILE>\fR File to use for logging. .TP -\fB\-L=<LOGLEVEL>, \fB\-\-log\-level=<LOGLEVEL>\fR +\fB\-L <LOGLEVEL>, \fB\-\-log\-level=<LOGLEVEL>\fR Logging severity. Valid options are TRACE, DEBUG, INFO, WARNING, ERROR and CRITICAL (the default is INFO). .TP \fB\-\-debug\fR @@ -59,7 +50,7 @@ Print the program version. .PP .SH FILES -/etc/glusterd/* +/var/lib/glusterd/* .SH SEE ALSO .nf diff --git a/doc/glusterd.vol b/doc/glusterd.vol deleted file mode 100644 index 4e43fb0da..000000000 --- a/doc/glusterd.vol +++ /dev/null @@ -1,8 +0,0 @@ -volume management - type mgmt/glusterd - option working-directory /etc/glusterd - option transport-type socket,rdma - option transport.socket.keepalive-time 10 - option transport.socket.keepalive-interval 2 -end-volume - diff --git a/doc/glusterfs.8 b/doc/glusterfs.8 index 7e4b2b49d..60ad5709b 100644 --- a/doc/glusterfs.8 +++ b/doc/glusterfs.8 @@ -1,19 +1,10 @@ -.\" Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> +.\" Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> .\" This file is part of GlusterFS. 
.\" -.\" GlusterFS is free software; you can redistribute it and/or modify -.\" it under the terms of the GNU Affero General Public License as published -.\" by the Free Software Foundation; either version 3 of the License, -.\" or (at your option) any later version. -.\" -.\" GlusterFS is distributed in the hope that it will be useful, but -.\" WITHOUT ANY WARRANTY; without even the implied warranty of -.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -.\" Affero General Public License for more details. -.\" -.\" You should have received a copy of the GNU Affero General Public License -.\" long with this program. If not, see -.\" <http://www.gnu.org/licenses/>. +.\" This file is licensed to you under your choice of the GNU Lesser +.\" General Public License, version 3 or any later version (LGPLv3 or +.\" later), or the GNU General Public License, version 2 (GPLv2), in all +.\" cases as published by the Free Software Foundation. .\" .\" .\" @@ -32,7 +23,8 @@ be made of any commodity hardware, such as x86-64 server with SATA-II RAID and Infiniband HBA. GlusterFS is fully POSIX compliant file system. On client side, it has dependency -on FUSE package, on server side, it works seemlessly on different operating systems. Currently supported on GNU/Linux and Solaris. +on FUSE package, on server side, it works seemlessly on different operating systems. +Currently supported on GNU/Linux and Solaris. .SH OPTIONS @@ -40,33 +32,55 @@ on FUSE package, on server side, it works seemlessly on different operating syst .PP .TP \fB\-f, \fB\-\-volfile=VOLUME-FILE\fR -File to use as VOLUME-FILE (the default is /etc/glusterfs/glusterfs.vol). +File to use as VOLUME-FILE. .TP \fB\-l, \fB\-\-log\-file=LOGFILE\fR -File to use for logging. +File to use for logging (the default is <INSTALL-DIR>/var/log/glusterfs/<MOUNT-POINT>.log). .TP \fB\-L, \fB\-\-log\-level=LOGLEVEL\fR -Logging severity. 
Valid options are TRACE, DEBUG, INFO, WARNING, ERROR and CRITICAL (the default is WARNING). +Logging severity. Valid options are TRACE, DEBUG, INFO, WARNING, ERROR and CRITICAL (the default is INFO). .TP \fB\-s, \fB\-\-volfile\-server=SERVER\fR Server to get the volume from. This option overrides \fB\-\-volfile \fR option. +.TP +\fB\-\-volfile\-max\-fetch\-attempts=MAX\-ATTEMPTS\fR +Maximum number of connect attempts to server. This option should be provided with +\fB\-\-volfile\-server\fR option (the default is 1). .SS "Advanced options" .PP .TP +\fB\-\-acl\fR +Mount the filesystem with POSIX ACL support. +.TP \fB\-\-debug\fR Run in debug mode. This option sets \fB\-\-no\-daemon\fR, \fB\-\-log\-level\fR to DEBUG, and \fB\-\-log\-file\fR to console. .TP +\fB\-\-enable\-ino32=BOOL\fR +Use 32-bit inodes when mounting to workaround application that doesn't support 64-bit inodes. +.TP +\fB\-\-fopen\-keep\-cache\fR +Do not purge the cache on file open. +.TP +\fB\-\-mac\-compat=BOOL\fR +Provide stubs for attributes needed for seamless operation on Macs (the default is off). +.TP \fB\-N, \fB\-\-no\-daemon\fR Run in the foreground. .TP -\fB\-\-read\-only\fR -Make the file system read-only. -.TP \fB\-p, \fB\-\-pid\-file=PIDFILE\fR File to use as PID file. .TP +\fB\-\-read\-only\fR +Mount the file system in 'read-only' mode. +.TP +\fB\-\-selinux\fR +Enable SELinux label (extended attributes) support on inodes. +.TP +\fB\-S, \fB\-\-socket\-file=SOCKFILE\fR +File to use as unix-socket. +.TP \fB\-\-volfile\-id=KEY\fR Key of the volume file to be fetched from the server. .TP @@ -74,11 +88,14 @@ Key of the volume file to be fetched from the server. Port number of volfile server. .TP \fB\-\-volfile\-server\-transport=TRANSPORT\fR -Transport type to get volume file from server (the deafult is socket). +Transport type to get volume file from server (the default is tcp). 
.TP \fB\-\-volume\-name=VOLUME\-NAME\fR Volume name to be used for MOUNT-POINT (the default is top most volume in VOLUME-FILE). .TP +\fB\-\-worm\fR +Mount the filesystem in 'worm' mode. +.TP \fB\-\-xlator\-option=VOLUME\-NAME.OPTION=VALUE\fR Add/Override a translator option for a volume with the specified value. @@ -89,11 +106,29 @@ Add/Override a translator option for a volume with the specified value. \fB\-\-attribute\-timeout=SECONDS\fR Set attribute timeout to SECONDS for inodes in fuse kernel module (the default is 1). .TP -\fB\-\-entry\-timeout=SECONDS\fR -Set entry timeout to SECONDS in fuse kernel module (the default is 1). +\fB\-\-background\-qlen=N\fR +Set fuse module's background queue length to N (the default is 64). +.TP +\fB\-\-congestion\-threshold=N\fR +Set fuse module's congestion threshold to N (the default is 48). .TP \fB\-\-direct\-io\-mode=BOOL\fR Enable/Disable the direct-I/O mode in fuse module (the default is enable). +.TP +\fB\-\-dump-fuse=PATH\f\R +Dump fuse traffic to PATH +.TP +\fB\-\-entry\-timeout=SECONDS\fR +Set entry timeout to SECONDS in fuse kernel module (the default is 1). +.TP +\fB\-\-gid\-timeout=SECONDS\fR +Set auxilary group list timeout to SECONDS for fuse translator (the default is 0). +.TP +\fB\-\-negative\-timeout=SECONDS\fR +Set negative timeout to SECONDS in fuse kernel module (the default is 0). +.TP +\fB\-\-volfile-check\fR +Enable strict volume file checking. .SS "Miscellaneous Options" .PP @@ -102,7 +137,7 @@ Enable/Disable the direct-I/O mode in fuse module (the default is enable). \fB\-?, \fB\-\-help\fR Display this help. .TP -\fB\-\-usage\fReew +\fB\-\-usage\fR Display a short usage message. .TP \fB\-V, \fB\-\-version\fR @@ -110,7 +145,12 @@ Print the program version. 
.PP .SH FILES -/etc/glusterfs/*.vol, /etc/glusterd/vols/*/*.vol +/var/lib/glusterd/vols/*/*.vol +.SH EXAMPLES +mount a volume named foo on server bar with log level DEBUG on mount point +/mnt/foo + +# glusterfs \-\-log\-level=DEBUG \-\-volfile\-id=foo \-\-volfile\-server=bar /mnt/foo .SH SEE ALSO .nf @@ -119,6 +159,6 @@ Print the program version. .fi .SH COPYRIGHT .nf -Copyright(c) 2006-2011 Gluster, Inc. <http://www.gluster.com> +Copyright(c) 2006-2011 Red Hat, Inc. <http://www.redhat.com> \fR .fi diff --git a/doc/glusterfs.vol.sample b/doc/glusterfs.vol.sample deleted file mode 100644 index 3b1f18517..000000000 --- a/doc/glusterfs.vol.sample +++ /dev/null @@ -1,61 +0,0 @@ -### file: client-volume.vol.sample - -##################################### -### GlusterFS Client Volume File ## -##################################### - -#### CONFIG FILE RULES: -### "#" is comment character. -### - Config file is case sensitive -### - Options within a volume block can be in any order. -### - Spaces or tabs are used as delimitter within a line. -### - Each option should end within a line. -### - Missing or commented fields will assume default values. -### - Blank/commented lines are allowed. -### - Sub-volumes should already be defined above before referring. 
- -### Add client feature and attach to remote subvolume -volume client - type protocol/client - option transport-type tcp -# option transport-type unix -# option transport-type ib-sdp - option remote-host 127.0.0.1 # IP address of the remote brick -# option transport.socket.remote-port 24016 - -# option transport-type ib-verbs -# option transport.ib-verbs.remote-port 24016 -# option transport.ib-verbs.work-request-send-size 1048576 -# option transport.ib-verbs.work-request-send-count 16 -# option transport.ib-verbs.work-request-recv-size 1048576 -# option transport.ib-verbs.work-request-recv-count 16 - -# option transport-timeout 30 # seconds to wait for a reply - # from server for each request - option remote-subvolume brick # name of the remote volume -end-volume - -### Add readahead feature -#volume readahead -# type performance/read-ahead -# option page-size 1MB # unit in bytes -# option page-count 2 # cache per file = (page-count x page-size) -# subvolumes client -#end-volume - -### Add IO-Cache feature -#volume iocache -# type performance/io-cache -# option page-size 256KB -# option page-count 2 -# subvolumes readahead -#end-volume - -### Add writeback feature -#volume writeback -# type performance/write-behind -# option aggregate-size 1MB -# option window-size 2MB -# option flush-behind off -# subvolumes iocache -#end-volume diff --git a/doc/glusterfsd.8 b/doc/glusterfsd.8 index dbf3e3fda..176d04236 100644 --- a/doc/glusterfsd.8 +++ b/doc/glusterfsd.8 @@ -1,19 +1,10 @@ -.\" Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> +.\" Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> .\" This file is part of GlusterFS. .\" -.\" GlusterFS is free software; you can redistribute it and/or modify -.\" it under the terms of the GNU Affero General Public License as published -.\" by the Free Software Foundation; either version 3 of the License, -.\" or (at your option) any later version. 
-.\" -.\" GlusterFS is distributed in the hope that it will be useful, but -.\" WITHOUT ANY WARRANTY; without even the implied warranty of -.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -.\" Affero General Public License for more details. -.\" -.\" You should have received a copy of the GNU Affero General Public License -.\" long with this program. If not, see -.\" <http://www.gnu.org/licenses/>. +.\" This file is licensed to you under your choice of the GNU Lesser +.\" General Public License, version 3 or any later version (LGPLv3 or +.\" later), or the GNU General Public License, version 2 (GPLv2), in all +.\" cases as published by the Free Software Foundation. .\" .\" .\" @@ -73,6 +64,15 @@ Makes the filesystem read-only \fB\-p, \fB\-\-pid\-file=PIDFILE\fR File to use as pid file .TP +\fB\-S SOCKFILE +Socket file to used for inter-process communication +.TP +\fB\-\-brick\-name DIRECTORY +Directory to be used as export directory for GlusterFS +.TP +\fB\-\-brick\-port PORT +Brick Port to be registered with Gluster portmapper +.TP \fB\-\-volfile\-id=KEY\fR KEY of the volume file to be fetched from server .TP @@ -80,7 +80,7 @@ KEY of the volume file to be fetched from server Port number of volfile server .TP \fB\-\-volfile\-server\-transport=TRANSPORT\fR -Transport type to get volume file from server [default: socket] +Transport type to get volume file from server [default: tcp] .TP \fB\-\-volume\-name=VOLUME\-NAME\fR Volume name to be used for MOUNT-POINT [default: top most volume in @@ -119,6 +119,11 @@ Print program version .SH FILES /etc/glusterfs/*.vol +.SH EXAMPLES +Start a GlusterFS server on localhost with volume name foo + +glusterfsd \-s localhost \-\-volfile\-id foo.server.media-disk\-1 \-p /var/lib/glusterd/vols/foo/run/server\-media\-disk\-1.pid \-S /tmp/<uniqueid>.socket \-\-brick-name /media/disk\-1 \-l /var/log/glusterfs/bricks/media\-disk\-1.log \-\-brick\-port 24009 \-\-xlator\-option foo\-server.listen-port=24009 + .SH SEE 
ALSO .nf \fBfusermount\fR(1), \fBmount.glusterfs\fR(8), \fBgluster\fR(8) diff --git a/doc/glusterfsd.vol.sample b/doc/glusterfsd.vol.sample deleted file mode 100644 index e91df3290..000000000 --- a/doc/glusterfsd.vol.sample +++ /dev/null @@ -1,47 +0,0 @@ -### file: server-volume.vol.sample - -##################################### -### GlusterFS Server Volume File ## -##################################### - -#### CONFIG FILE RULES: -### "#" is comment character. -### - Config file is case sensitive -### - Options within a volume block can be in any order. -### - Spaces or tabs are used as delimitter within a line. -### - Multiple values to options will be : delimitted. -### - Each option should end within a line. -### - Missing or commented fields will assume default values. -### - Blank/commented lines are allowed. -### - Sub-volumes should already be defined above before referring. - -### Export volume "brick" with the contents of "/home/export" directory. -volume brick - type storage/posix # POSIX FS translator - option directory /home/export # Export this directory -end-volume - -### Add network serving capability to above brick. -volume server - type protocol/server - option transport-type tcp -# option transport-type unix -# option transport-type ib-sdp -# option transport.socket.bind-address 192.168.1.10 # Default is to listen on all interfaces -# option transport.socket.listen-port 24016 - -# option transport-type ib-verbs -# option transport.ib-verbs.bind-address 192.168.1.10 # Default is to listen on all interfaces -# option transport.ib-verbs.listen-port 24016 -# option transport.ib-verbs.work-request-send-size 131072 -# option transport.ib-verbs.work-request-send-count 64 -# option transport.ib-verbs.work-request-recv-size 131072 -# option transport.ib-verbs.work-request-recv-count 64 - -# option client-volume-filename /etc/glusterfs/glusterfs-client.vol - subvolumes brick -# NOTE: Access to any volume through protocol/server is denied by -# default. 
You need to explicitly grant access through # "auth" -# option. - option auth.addr.brick.allow * # Allow access to "brick" volume -end-volume diff --git a/doc/hacker-guide/adding-fops.txt b/doc/hacker-guide/adding-fops.txt deleted file mode 100644 index e70dbbdc8..000000000 --- a/doc/hacker-guide/adding-fops.txt +++ /dev/null @@ -1,33 +0,0 @@ - HOW TO ADD A NEW FOP TO GlusterFS - ================================= - -Steps to be followed when adding a new FOP to GlusterFS: - -1. Edit glusterfs.h and add a GF_FOP_* constant. - -2. Edit xlator.[ch] and: - 2a. add the new prototype for fop and callback. - 2b. edit xlator_fops structure. - -3. Edit xlator.c and add to fill_defaults. - -4. Edit protocol.h and add struct necessary for the new FOP. - -5. Edit defaults.[ch] and provide default implementation. - -6. Edit call-stub.[ch] and provide stub implementation. - -7. Edit common-utils.c and add to gf_global_variable_init(). - -8. Edit client-protocol and add your FOP. - -9. Edit server-protocol and add your FOP. - -10. Implement your FOP in any translator for which the default implementation - is not sufficient. - -========================================== -Last updated: Mon Oct 27 21:35:49 IST 2008 - -Author: Vikas Gorur <vikas@gluster.com> -========================================== diff --git a/doc/hacker-guide/bdb.txt b/doc/hacker-guide/bdb.txt deleted file mode 100644 index fd0bd3652..000000000 --- a/doc/hacker-guide/bdb.txt +++ /dev/null @@ -1,70 +0,0 @@ - -* How does file translates to key/value pair? ---------------------------------------------- - - in bdb a file is identified by key (obtained by taking basename() of the path of -the file) and file contents are stored as value corresponding to the key in database -file (defaults to glusterfs_storage.db under dirname() directory). - -* symlinks, directories ------------------------ - - symlinks and directories are stored as is. 
- -* db (database) files ---------------------- - - every directory, including root directory, contains a database file called -glusterfs_storage.db. all the regular files contained in the directory are stored -as key/value pair inside the glusterfs_storage.db. - -* internal data cache ---------------------- - - db does not provide a way to find out the size of the value corresponding to a key. -so, bdb makes DB->get() call for key and takes the length of the value returned. -since DB->get() also returns file contents for key, bdb maintains an internal cache and -stores the file contents in the cache. - every directory maintains a seperate cache. - -* inode number transformation ------------------------------ - - bdb allocates a inode number to each file and directory on its own. bdb maintains a -global counter and increments it after allocating inode number for each file -(regular, symlink or directory). NOTE: bdb does not guarantee persistent inode numbers. - -* checkpoint thread -------------------- - - bdb creates a checkpoint thread at the time of init(). checkpoint thread does a -periodic checkpoint on the DB_ENV. checkpoint is the mechanism, provided by db, to -forcefully commit the logged transactions to the storage. - -NOTES ABOUT FOPS: ------------------ - -lookup() - - 1> do lstat() on the path, if lstat fails, we assume that the file being looked up - is either a regular file or doesn't exist. - 2> lookup in the DB of parent directory for key corresponding to path. if key exists, - return key, with. - NOTE: 'struct stat' stat()ed from DB file is used as a container for 'struct stat' - of the regular file. st_ino, st_size, st_blocks are updated with file's values. - -readv() - - 1> do a lookup in bctx cache. if successful, return the requested data from cache. - 2> if cache missed, do a DB->get() the entire file content and insert to cache. - -writev(): - 1> flush any cached content of this file. - 2> do a DB->put(), with DB_DBT_PARTIAL flag. 
- NOTE: DB_DBT_PARTIAL is used to do partial update of a value in DB. - -readdir(): - 1> regular readdir() in a loop, and vomit all DB_ENV log files and DB files that - we encounter. - 2> if the readdir() buffer still has space, open a DB cursor and do a sequential - DBC->get() to fill the reaadir buffer. - - diff --git a/doc/hacker-guide/en-US/markdown/adding-fops.md b/doc/hacker-guide/en-US/markdown/adding-fops.md new file mode 100644 index 000000000..3f72ed3e2 --- /dev/null +++ b/doc/hacker-guide/en-US/markdown/adding-fops.md @@ -0,0 +1,18 @@ +Adding a new FOP +================ + +Steps to be followed when adding a new FOP to GlusterFS: + +1. Edit `glusterfs.h` and add a `GF_FOP_*` constant. +2. Edit `xlator.[ch]` and: + * add the new prototype for fop and callback. + * edit `xlator_fops` structure. +3. Edit `xlator.c` and add to fill_defaults. +4. Edit `protocol.h` and add struct necessary for the new FOP. +5. Edit `defaults.[ch]` and provide default implementation. +6. Edit `call-stub.[ch]` and provide stub implementation. +7. Edit `common-utils.c` and add to gf_global_variable_init(). +8. Edit client-protocol and add your FOP. +9. Edit server-protocol and add your FOP. +10. Implement your FOP in any translator for which the default implementation + is not sufficient. diff --git a/doc/hacker-guide/en-US/markdown/afr.md b/doc/hacker-guide/en-US/markdown/afr.md new file mode 100644 index 000000000..1be7e39f2 --- /dev/null +++ b/doc/hacker-guide/en-US/markdown/afr.md @@ -0,0 +1,191 @@ +cluster/afr translator +====================== + +Locking +------- + +Before understanding replicate, one must understand two internal FOPs: + +### `GF_FILE_LK` + +This is exactly like `fcntl(2)` locking, except the locks are in a +separate domain from locks held by applications. + +### `GF_DIR_LK (loc_t *loc, char *basename)` + +This allows one to lock a name under a directory. 
For example, +to lock /mnt/glusterfs/foo, one would use the call: + +``` +GF_DIR_LK ({loc_t for "/mnt/glusterfs"}, "foo") +``` + +If one wishes to lock *all* the names under a particular directory, +supply the basename argument as `NULL`. + +The locks can either be read locks or write locks; consult the +function prototype for more details. + +Both these operations are implemented by the features/locks (earlier +known as posix-locks) translator. + +Basic design +------------ + +All FOPs can be classified into four major groups: + +### inode-read + +Operations that read an inode's data (file contents) or metadata (perms, etc.). + +access, getxattr, fstat, readlink, readv, stat. + +### inode-write + +Operations that modify an inode's data or metadata. + +chmod, chown, truncate, writev, utimens. + +### dir-read + +Operations that read a directory's contents or metadata. + +readdir, getdents, checksum. + +### dir-write + +Operations that modify a directory's contents or metadata. + +create, link, mkdir, mknod, rename, rmdir, symlink, unlink. + +Some of these make a subgroup in that they modify *two* different entries: +link, rename, symlink. + +### Others + +Other operations. + +flush, lookup, open, opendir, statfs. + +Algorithms +---------- + +Each of the four major groups has its own algorithm: + +### inode-read, dir-read + +1. Send a request to the first child that is up: + * if it fails: + * try the next available child + * if we have exhausted all children: + * return failure + +### inode-write + + All operations are done in parallel unless specified otherwise. + +1. Send a ``GF_FILE_LK`` request on all children for a write lock on the + appropriate region + (for metadata operations: entire file (0, 0) for writev: + (offset, offset+size of buffer)) + * If a lock request fails on a child: + * unlock all children + * try to acquire a blocking lock (`F_SETLKW`) on each child, serially. 
+ If this fails (due to `ENOTCONN` or `EINVAL`): + Consider this child as dead for rest of transaction. +2. Mark all children as "pending" on all (alive) children (see below for +meaning of "pending"). + * If it fails on any child: + * mark it as dead (in transaction local state). +3. Perform operation on all (alive) children. + * If it fails on any child: + * mark it as dead (in transaction local state). +4. Unmark all successful children as not "pending" on all nodes. +5. Unlock region on all (alive) children. + +### dir-write + + The algorithm for dir-write is same as above except instead of holding + `GF_FILE_LK` locks we hold a GF_DIR_LK lock on the name being operated upon. + In case of link-type calls, we hold locks on both the operand names. + +"pending" +--------- + +The "pending" number is like a journal entry. A pending entry is an +array of 32-bit integers stored in network byte-order as the extended +attribute of an inode (which can be a directory as well). + +There are three keys corresponding to three types of pending operations: + +### `AFR_METADATA_PENDING` + +There are some metadata operations pending on this inode (perms, ctime/mtime, +xattr, etc.). + +### `AFR_DATA_PENDING` + +There is some data pending on this inode (writev). + +### `AFR_ENTRY_PENDING` + +There are some directory operations pending on this directory +(create, unlink, etc.). + +Self heal +--------- + +* On lookup, gather extended attribute data: + * If entry is a regular file: + * If an entry is present on one child and not on others: + * create entry on others. + * If entries exist but have different metadata (perms, etc.): + * consider the entry with the highest `AFR_METADATA_PENDING` number as + definitive and replicate its attributes on children. + * If entry is a directory: + * Consider the entry with the higest `AFR_ENTRY_PENDING` number as + definitive and replicate its contents on all children. 
+ * If any two entries have non-matching types (i.e., one is file and + other is directory): + * Announce to the user via log that a split-brain situation has been + detected, and do nothing. +* On open, gather extended attribute data: + * Consider the file with the highest `AFR_DATA_PENDING` number as + the definitive one and replicate its contents on all other + children. + +During all self heal operations, appropriate locks must be held on all +regions/entries being affected. + +Inode scaling +------------- + +Inode scaling is necessary because if a situation arises where an inode number +is returned for a directory (by lookup) which was previously the inode number +of a file (as per FUSE's table), then FUSE gets horribly confused (consult a +FUSE expert for more details). + +To avoid such a situation, we distribute the 64-bit inode space equally +among all children of replicate. + +To illustrate: + +If c1, c2, c3 are children of replicate, they each get 1/3 of the available +inode space: + +------------- -- -- -- -- -- -- -- -- -- -- -- --- +Child: c1 c2 c3 c1 c2 c3 c1 c2 c3 c1 c2 ... +Inode number: 1 2 3 4 5 6 7 8 9 10 11 ... +------------- -- -- -- -- -- -- -- -- -- -- -- --- + +Thus, if lookup on c1 returns an inode number "2", it is scaled to "4" +(which is the second inode number in c1's space). + +This way we ensure that there is never a collision of inode numbers from +two different children. + +This reduction of inode space doesn't really reduce the usability of +replicate since even if we assume replicate has 1024 children (which would be a +highly unusual scenario), each child still has a 54-bit inode space: +$2^{54} \sim 1.8 \times 10^{16}$, which is much larger than any real +world requirement. 
diff --git a/doc/hacker-guide/en-US/markdown/coding-standard.md b/doc/hacker-guide/en-US/markdown/coding-standard.md new file mode 100644 index 000000000..178dc142a --- /dev/null +++ b/doc/hacker-guide/en-US/markdown/coding-standard.md @@ -0,0 +1,402 @@ +GlusterFS Coding Standards +========================== + +Structure definitions should have a comment per member +------------------------------------------------------ + +Every member in a structure definition must have a comment about its +purpose. The comment should be descriptive without being overly verbose. + +*Bad:* + +``` +gf_lock_t lock; /* lock */ +``` + +*Good:* + +``` +DBTYPE access_mode; /* access mode for accessing + * the databases, can be + * DB_HASH, DB_BTREE + * (option access-mode <mode>) + */ +``` + +Declare all variables at the beginning of the function +------------------------------------------------------ + +All local variables in a function must be declared immediately after the +opening brace. This makes it easy to keep track of memory that needs to be freed +during exit. It also helps debugging, since gdb cannot handle variables +declared inside loops or other such blocks. + +Always initialize local variables +--------------------------------- + +Every local variable should be initialized to a sensible default value +at the point of its declaration. All pointers should be initialized to NULL, +and all integers should be zero or (if it makes sense) an error value. + + +*Good:* + +``` +int ret = 0; +char *databuf = NULL; +int _fd = -1; +``` + +Initialization should always be done with a constant value +---------------------------------------------------------- + +Never use a non-constant expression as the initialization value for a variable. + + +*Bad:* + +``` +pid_t pid = frame->root->pid; +char *databuf = malloc (1024); +``` + +Validate all arguments to a function +------------------------------------ + +All pointer arguments to a function must be checked for `NULL`. 
+A macro named `VALIDATE` (in `common-utils.h`) +takes one argument, and if it is `NULL`, writes a log message and +jumps to a label called `err` after setting op_ret and op_errno +appropriately. It is recommended to use this template. + + +*Good:* + +``` +VALIDATE(frame); +VALIDATE(this); +VALIDATE(inode); +``` + +Never rely on precedence of operators +------------------------------------- + +Never write code that relies on the precedence of operators to execute +correctly. Such code can be hard to read and someone else might not +know the precedence of operators as accurately as you do. + +*Bad:* + +``` +if (op_ret == -1 && errno != ENOENT) +``` + +*Good:* + +``` +if ((op_ret == -1) && (errno != ENOENT)) +``` + +Use exactly matching types +-------------------------- + +Use a variable of the exact type declared in the manual to hold the +return value of a function. Do not use an ``equivalent'' type. + + +*Bad:* + +``` +int len = strlen (path); +``` + +*Good:* + +``` +size_t len = strlen (path); +``` + +Never write code such as `foo->bar->baz`; check every pointer +------------------------------------------------------------- + +Do not write code that blindly follows a chain of pointer +references. Any pointer in the chain may be `NULL` and thus +cause a crash. Verify that each pointer is non-null before following +it. + +Check return value of all functions and system calls +---------------------------------------------------- + +The return value of all system calls and API functions must be checked +for success or failure. + +*Bad:* + +``` +close (fd); +``` + +*Good:* + +``` +op_ret = close (_fd); +if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "close on file %s failed (%s)", real_path, + strerror (errno)); + op_errno = errno; + goto out; +} +``` + + +Gracefully handle failure of malloc +----------------------------------- + +GlusterFS should never crash or exit due to lack of memory. 
If a
+memory allocation fails, the call should be unwound and an error
+returned to the user.
+
+*Use result args and reserve the return value to indicate success or failure:*
+
+The return value of every function must indicate success or failure (unless
+it is impossible for the function to fail --- e.g., boolean functions). If
+the function needs to return additional data, it must be returned using a
+result (pointer) argument.
+
+*Bad:*
+
+```
+int32_t dict_get_int32 (dict_t *this, char *key);
+```
+
+*Good:*
+
+```
+int dict_get_int32 (dict_t *this, char *key, int32_t *val);
+```
+
+Always use the `n` versions of string functions
+-----------------------------------------------
+
+Unless impossible, use the length-limited versions of the string functions.
+
+*Bad:*
+
+```
+strcpy (entry_path, real_path);
+```
+
+*Good:*
+
+```
+strncpy (entry_path, real_path, entry_path_len);
+```
+
+No dead or commented code
+-------------------------
+
+There must be no dead code (code to which control can never be passed) or
+commented out code in the codebase.
+
+Only one unwind and return per function
+---------------------------------------
+
+There must be only one exit out of a function. `UNWIND` and return
+should happen at only one point in the function.
+
+Function length or Keep functions small
+---------------------------------------
+
+We live in the UNIX-world where modules do one thing and do it well.
+This rule should apply to our functions also. If a function is very long, try splitting it
+into many little helper functions. The question is, in a coding
+spree, how do we know a function is long and unreadable? One rule of
+thumb given by Linus Torvalds is that a function should be broken up
+if you have 4 or more levels of indentation going on for more than 3-4
+lines.
+ +*Example for a helper function:* +``` +static int +same_owner (posix_lock_t *l1, posix_lock_t *l2) +{ + return ((l1->client_pid == l2->client_pid) && + (l1->transport == l2->transport)); +} +``` + +Defining functions as static +---------------------------- + +Define internal functions as static only if you're +very sure that there will not be a crash(..of any kind..) emanating in +that function. If there is even a remote possibility, perhaps due to +pointer derefering, etc, declare the function as non-static. This +ensures that when a crash does happen, the function name shows up the +in the back-trace generated by libc. However, doing so has potential +for polluting the function namespace, so to avoid conflicts with other +components in other parts, ensure that the function names are +prepended with a prefix that identify the component to which it +belongs. For eg. non-static functions in io-threads translator start +with iot_. + +Ensure function calls wrap around after 80-columns +-------------------------------------------------- + +Place remaining arguments on the next line if needed. + +Functions arguments and function definition +------------------------------------------- + +Place all the arguments of a function definition on the same line +until the line goes beyond 80-cols. Arguments that extend beyind +80-cols should be placed on the next line. + +Style issues +------------ + +### Brace placement + +Use K&R/Linux style of brace placement for blocks. + +*Good:* + +``` +int some_function (...) +{ + if (...) { + /* ... */ + } else if (...) { + /* ... */ + } else { + /* ... */ + } + + do { + /* ... */ + } while (cond); +} +``` + +### Indentation + +Use *eight* spaces for indenting blocks. Ensure that your +file contains only spaces and not tab characters. You can do this +in Emacs by selecting the entire file (`C-x h`) and +running `M-x untabify`. 
+ +To make Emacs indent lines automatically by eight spaces, add this +line to your `.emacs`: + +``` +(add-hook 'c-mode-hook (lambda () (c-set-style "linux"))) +``` + +### Comments + +Write a comment before every function describing its purpose (one-line), +its arguments, and its return value. Mention whether it is an internal +function or an exported function. + +Write a comment before every structure describing its purpose, and +write comments about each of its members. + +Follow the style shown below for comments, since such comments +can then be automatically extracted by doxygen to generate +documentation. + +*Good:* + +``` +/** +* hash_name -hash function for filenames +* @par: parent inode number +* @name: basename of inode +* @mod: number of buckets in the hashtable +* +* @return: success: bucket number +* failure: -1 +* +* Not for external use. +*/ +``` + +### Indicating critical sections + +To clearly show regions of code which execute with locks held, use +the following format: + +``` +pthread_mutex_lock (&mutex); +{ + /* code */ +} +pthread_mutex_unlock (&mutex); +``` + +*A skeleton fop function:* + +This is the recommended template for any fop. In the beginning come +the initializations. After that, the `success' control flow should be +linear. Any error conditions should cause a `goto` to a single +point, `out`. At that point, the code should detect the error +that has occured and do appropriate cleanup. + +``` +int32_t +sample_fop (call_frame_t *frame, xlator_t *this, ...) +{ + char * var1 = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + DIR * dir = NULL; + struct posix_fd * pfd = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + + /* other validations */ + + dir = opendir (...); + + if (dir == NULL) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "opendir failed on %s (%s)", loc->path, + strerror (op_errno)); + goto out; + } + + /* another system call */ + if (...) 
{ + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + /* ... */ + + out: + if (op_ret == -1) { + + /* check for all the cleanup that needs to be + done */ + + if (dir) { + closedir (dir); + dir = NULL; + } + + if (pfd) { + FREE (pfd->path); + FREE (pfd); + pfd = NULL; + } + } + + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} +``` diff --git a/doc/hacker-guide/en-US/markdown/posix.md b/doc/hacker-guide/en-US/markdown/posix.md new file mode 100644 index 000000000..84c813e55 --- /dev/null +++ b/doc/hacker-guide/en-US/markdown/posix.md @@ -0,0 +1,59 @@ +storage/posix translator +======================== + +Notes +----- + +### `SET_FS_ID` + +This is so that all filesystem checks are done with the user's +uid/gid and not GlusterFS's uid/gid. + +### `MAKE_REAL_PATH` + +This macro concatenates the base directory of the posix volume +('option directory') with the given path. + +### `need_xattr` in lookup + +If this flag is passed, lookup returns a xattr dictionary that contains +the file's create time, the file's contents, and the version number +of the file. + +This is a hack to increase small file performance. If an application +wants to read a small file, it can finish its job with just a lookup +call instead of a lookup followed by read. + +### `getdents`/`setdents` + +These are used by unify to set and get directory entries. + +### `ALIGN_BUF` + +Macro to align an address to a page boundary (4K). + +### `priv->export_statfs` + +In some cases, two exported volumes may reside on the same +partition on the server. Sending statvfs info for both +the volumes will lead to erroneous df output at the client, +since free space on the partition will be counted twice. + +In such cases, user can disable exporting statvfs info +on one of the volumes by setting this option. + +### `xattrop` + +This fop is used by replicate to set version numbers on files. 
+ +### `getxattr`/`setxattr` hack to read/write files + +A key, `GLUSTERFS_FILE_CONTENT_STRING`, is handled in a special way by +`getxattr`/`setxattr`. A getxattr with the key will return the entire +content of the file as the value. A `setxattr` with the key will write +the value as the entire content of the file. + +### `posix_checksum` + +This calculates a simple XOR checksum on all entry names in a +directory that is used by unify to compare directory contents. diff --git a/doc/hacker-guide/en-US/markdown/translator-development.md b/doc/hacker-guide/en-US/markdown/translator-development.md new file mode 100644 index 000000000..77d1b606a --- /dev/null +++ b/doc/hacker-guide/en-US/markdown/translator-development.md @@ -0,0 +1,666 @@ +Translator development +====================== + +Setting the Stage +----------------- + +This is the first post in a series that will explain some of the details of +writing a GlusterFS translator, using some actual code to illustrate. + +Before we begin, a word about environments. GlusterFS is over 300K lines of +code spread across a few hundred files. That's no Linux kernel or anything, but + you're still going to be navigating through a lot of code in every +code-editing session, so some kind of cross-referencing is *essential*. I use +cscope with the vim bindings, and if I couldn't do Crtl+G and such to jump +between definitions all the time my productivity would be cut in half. You may +prefer different tools, but as I go through these examples you'll need +something functionally similar to follow on. OK, on with the show. + +The first thing you need to know is that translators are not just bags of +functions and variables. They need to have a very definite internal structure +so that the translator-loading code can figure out where all the pieces are. 
+The way it does this is to use dlsym to look for specific names within your +shared-object file, as follow (from `xlator.c`): + +``` +if (!(xl->fops = dlsym (handle, "fops"))) { + gf_log ("xlator", GF_LOG_WARNING, "dlsym(fops) on %s", + dlerror ()); + goto out; +} + +if (!(xl->cbks = dlsym (handle, "cbks"))) { + gf_log ("xlator", GF_LOG_WARNING, "dlsym(cbks) on %s", + dlerror ()); + goto out; +} + +if (!(xl->init = dlsym (handle, "init"))) { + gf_log ("xlator", GF_LOG_WARNING, "dlsym(init) on %s", + dlerror ()); + goto out; +} + +if (!(xl->fini = dlsym (handle, "fini"))) { + gf_log ("xlator", GF_LOG_WARNING, "dlsym(fini) on %s", + dlerror ()); + goto out; +} +``` + +In this example, `xl` is a pointer to the in-memory object for the translator +we're loading. As you can see, it's looking up various symbols *by name* in the + shared object it just loaded, and storing pointers to those symbols. Some of +them (e.g. init are functions, while others e.g. fops are dispatch tables +containing pointers to many functions. Together, these make up the translator's + public interface. + +Most of this glue or boilerplate can easily be found at the bottom of one of +the source files that make up each translator. We're going to use the `rot-13` +translator just for fun, so in this case you'd look in `rot-13.c` to see this: + +``` +struct xlator_fops fops = { + .readv = rot13_readv, + .writev = rot13_writev +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { +{ .key = {"encrypt-write"}, + .type = GF_OPTION_TYPE_BOOL +}, +{ .key = {"decrypt-read"}, + .type = GF_OPTION_TYPE_BOOL +}, +{ .key = {NULL} }, +}; +``` + +The `fops` table, defined in `xlator.h`, is one of the most important pieces. +This table contains a pointer to each of the filesystem functions that your +translator might implement -- `open`, `read`, `stat`, `chmod`, and so on. 
There
+are 82 such functions in all, but don't worry; any that you don't specify here
+will be seen as null and filled with defaults from `defaults.c` when your
+translator is loaded. In this particular example, since `rot-13` is an
+exceptionally simple translator, we only fill in two entries for `readv` and
+`writev`.
+
+There are actually two other tables, also required to have predefined names,
+that are also used to find translator functions: `cbks` (which is empty in this
+ snippet) and `dumpops` (which is missing entirely). The first of these specifies
+ entry points for when inodes are forgotten or file descriptors are released.
+In other words, they're destructors for objects in which your translator might
+ have an interest. Mostly you can ignore them, because the default behavior
+handles even the simpler cases of translator-specific inode/fd context
+automatically. However, if the context you attach is a complex structure
+requiring complex cleanup, you'll need to supply these functions. As for
+dumpops, that's just used if you want to provide functions to pretty-print
+various structures in logs. I've never used it myself, though I probably
+should. What's noteworthy here is that we don't even define dumpops. That's
+because all of the functions that might use these dispatch functions will check
+ for `xl->dumpops` being `NULL` before calling through it. This is in sharp
+contrast to the behavior for `fops` and `cbks`, which *must* be present. If
+they're not, translator loading will fail because these pointers are not
+checked every time and if they're `NULL` then we'll segfault. That's why we
+provide an empty definition for cbks; it's OK for the individual function
+pointers to be NULL, but not for the whole table to be absent.
+
+The last piece I'll cover today is options. As you can see, this is a table of
+translator-specific option names and some information about their types.
+GlusterFS actually provides a pretty rich set of types (`volume_option_type_t` +in `options.`h) which includes paths, translator names, percentages, and times +in addition to the obvious integers and strings. Also, the `volume_option_t` +structure can include information about alternate names, min/max/default +values, enumerated string values, and descriptions. We don't see any of these +here, so let's take a quick look at some more complex examples from afr.c and +then come back to `rot-13`. + +``` +{ .key = {"data-self-heal-algorithm"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "", + .description = "Select between \"full\", \"diff\". The " + "\"full\" algorithm copies the entire file from " + "source to sink. The \"diff\" algorithm copies to " + "sink only those blocks whose checksums don't match " + "with those of source.", + .value = { "diff", "full", "" } +}, +{ .key = {"data-self-heal-window-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, + .default_value = "1", + .description = "Maximum number blocks per file for which " + "self-heal process would be applied simultaneously." +}, +``` + +When your translator is loaded, all of this information is used to parse the +options actually provided in the volfile, and then the result is turned into a +dictionary and stored as `xl->options`. This dictionary is then processed by +your init function, which you can see being looked up in the first code +fragment above. We're only going to look at a small part of the `rot-13`'s +init for now. + +``` +priv->decrypt_read = 1; +priv->encrypt_write = 1; + +data = dict_get (this->options, "encrypt-write"); +if (data) { + if (gf_string2boolean (data->data, &priv->encrypt_write + == -1) { + gf_log (this->name, GF_LOG_ERROR, + "encrypt-write takes only boolean options"); + return -1; + } +} +``` + +What we can see here is that we're setting some defaults in our priv structure, +then looking to see if an `encrypt-write` option was actually provided. 
If so,
+we convert and store it. This is a pretty classic use of dict_get to fetch a
+field from a dictionary, and of using one of many conversion functions in
+`common-utils.c` to convert `data->data` into something we can use.
+
+So far we've covered the basics of how a translator gets loaded, how we find its
+various parts, and how we process its options. In my next Translator 101 post,
+we'll go a little deeper into other things that `init` and its companion `fini`
+might do, and how some other fields in our `xlator_t` structure (commonly
+referred to as `this`) are commonly used.
+
+`init`, `fini`, and private context
+-----------------------------------
+
+In the previous Translator 101 post, we looked at some of the dispatch tables
+and options processing in a translator. This time we're going to cover the rest
+ of the "shell" of a translator -- i.e. the other global parts not specific to
+handling a particular request.
+
+Let's start by looking at the relationship between a translator and its shared
+library. At a first approximation, this is the relationship between an object
+and a class in just about any object-oriented programming language. The class
+defines behaviors, but has to be instantiated as an object to have any kind of
+existence. In our case the object is an `xlator_t`. Several of these might be
+created within the same daemon, sharing all of the same code through init/fini
+and dispatch tables, but sharing *no data*. You could implement shared data (as
+ static variables in your shared libraries) but that's strongly discouraged.
+Every function in your shared library will get an `xlator_t` as an argument,
+and should use it. This lack of class-level data is one of the points where
+the analogy to common OOP systems starts to break down. Another place is the
+complete lack of inheritance. Translators inherit behavior (code) from exactly
+one shared library -- looked up and loaded using the `type` field in a volfile
+`volume ... 
end-volume` block -- and that's it -- not even single inheritance, +no subclasses or superclasses, no mixins or prototypes, just the relationship +between an object and its class. With that in mind, let's turn to the init +function that we just barely touched on last time. + +``` +int32_t +init (xlator_t *this) +{ + data_t *data = NULL; + rot_13_private_t *priv = NULL; + + if (!this->children || this->children->next) { + gf_log ("rot13", GF_LOG_ERROR, + "FATAL: rot13 should have exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + priv = GF_CALLOC (sizeof (rot_13_private_t), 1, 0); + if (!priv) + return -1; +``` + +At the very top, we see the function signature -- we get a pointer to the +`xlator_t` object that we're initializing, and we return an `int32_t` status. +As with most functions in the translator API, this should be zero to indicate +success. In this case it's safe to return -1 for failure, but watch out: in +dispatch-table functions, the return value means the status of the *function +call* rather than the *request*. A request error should be reflected as a +callback with a non-zero `op_re`t value, but the dispatch function itself +should still return zero. In fact, the handling of a non-zero return from a +dispatch function is not all that robust (we recently had a bug report in +HekaFS related to this) so it's something you should probably avoid +altogether. This only underscores the difference between dispatch functions +and `init`/`fini` functions, where non-zero returns *are* expected and handled +logically by aborting the translator setup. We can see that down at the +bottom, where we return -1 to indicate that we couldn't allocate our +private-data area (more about that later). + +The first thing this init function does is check that the translator is being +set up in the right kind of environment. 
Translators are called by parents and +in turn call children. Some translators are "initial" translators that inject +requests into the system from elsewhere -- e.g. mount/fuse injecting requests +from the kernel, protocol/server injecting requests from the network. Those +translators don't need parents, but `rot-13` does and so we check for that. +Similarly, some translators are "final" translators that (from the perspective +of the current process) terminate requests instead of passing them on -- e.g. +`protocol/client` passing them to another node, `storage/posix` passing them to +a local filesystem. Other translators "multiplex" between multiple children -- + passing each parent request on to one (`cluster/dht`), some +(`cluster/stripe`), or all (`cluster/afr`) of those children. `rot-13` fits +into none of those categories either, so it checks that it has *exactly one* +child. It might be more convenient or robust if translator shared libraries +had standard variables describing these requirements, to be checked in a +consistent way by the translator-loading infrastructure itself instead of by +each separate init function, but this is the way translators work today. + +The last thing we see in this fragment is allocating our private data area. +This can literally be anything we want; the infrastructure just provides the +priv pointer as a convenience but takes no responsibility for how it's used. In + this case we're using `GF_CALLOC` to allocate our own `rot_13_private_t` +structure. This gets us all the benefits of GlusterFS's memory-leak detection +infrastructure, but the way we're calling it is not quite ideal. For one thing, + the first two arguments -- from `calloc(3)` -- are kind of reversed. For +another, notice how the last argument is zero. That can actually be an +enumerated value, to tell the GlusterFS allocator *what* type we're +allocating. 
This can be very useful information for memory profiling and leak +detection, so it's recommended that you follow the example of any +x`xx-mem-types.h` file elsewhere in the source tree instead of just passing +zero here (even though that works). + +To finish our tour of standard initialization/termination, let's look at the +end of `init` and the beginning of `fini`: + +``` + this->private = priv; + gf_log ("rot13", GF_LOG_DEBUG, "rot13 xlator loaded"); + return 0; +} + +void +fini (xlator_t *this) +{ + rot_13_private_t *priv = this->private; + + if (!priv) + return; + this->private = NULL; + GF_FREE (priv); +``` + +At the end of init we're just storing our private-data pointer in the `priv` +field of our `xlator_t`, then returning zero to indicate that initialization +succeeded. As is usually the case, our fini is even simpler. All it really has +to do is `GF_FREE` our private-data pointer, which we do in a slightly +roundabout way here. Notice how we don't even have a return value here, since +there's nothing obvious and useful that the infrastructure could do if `fini` +failed. + +That's practically everything we need to know to get our translator through +loading, initialization, options processing, and termination. If we had defined + no dispatch functions, we could actually configure a daemon to use our +translator and it would work as a basic pass-through from its parent to a +single child. In the next post I'll cover how to build the translator and +configure a daemon to use it, so that we can actually step through it in a +debugger and see how it all fits together before we actually start adding +functionality. + +This Time For Real +------------------ + +In the first two parts of this series, we learned how to write a basic +translator skeleton that can get through loading, initialization, and option +processing. This time we'll cover how to build that translator, configure a +volume to use it, and run the glusterfs daemon in debug mode. 
+ +Unfortunately, there's not much direct support for writing new translators. You +can check out a GlusterFS tree and splice in your own translator directory, but + that's a bit painful because you'll have to update multiple makefiles plus a +bunch of autoconf garbage. As part of the HekaFS project, I basically reverse +engineered the truly necessary parts of the translator-building process and +then pestered one of the Fedora glusterfs package maintainers (thanks +daMaestro!) to add a `glusterfs-devel` package with the required headers. Since + then the complexity level in the HekaFS tree has crept back up a bit, but I +still remember the simple method and still consider it the easiest way to get +started on a new translator. For the sake of those not using Fedora, I'm going +to describe a method that doesn't depend on that header package. What it does +depend on is a GlusterFS source tree, much as you might have cloned from GitHub + or the Gluster review site. This tree doesn't have to be fully built, but you +do need to run `autogen.sh` and configure in it. Then you can take the +following simple makefile and put it in a directory with your actual source. + +``` +# Change these to match your source code. +TARGET = rot-13.so +OBJECTS = rot-13.o + +# Change these to match your environment. +GLFS_SRC = /srv/glusterfs +GLFS_LIB = /usr/lib64 +HOST_OS = GF_LINUX_HOST_OS + +# You shouldn't need to change anything below here. + +CFLAGS = -fPIC -Wall -O0 -g \ + -DHAVE_CONFIG_H -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \ + -D$(HOST_OS) -I$(GLFS_SRC) -I$(GLFS_SRC)/contrib/uuid \ + -I$(GLFS_SRC)/libglusterfs/src +LDFLAGS = -shared -nostartfiles -L$(GLFS_LIB) -lglusterfs \ + -lpthread + +$(TARGET): $(OBJECTS) + $(CC) $(OBJECTS) $(LDFLAGS) -o $(TARGET) +``` + +Yes, it's still Linux-specific. Mea culpa. As you can see, we're sticking with +the `rot-13` example, so you can just copy the files from +`xlators/encryption/rot-13/src` in your GlusterFS tree to follow on. 
Type +`make` and you should be rewarded with a nice little `.so` file. + +``` +xlator_example$ ls -l rot-13.so +-rwxr-xr-x. 1 jeff jeff 40784 Nov 16 16:41 rot-13.so +``` + +Notice that we've built with optimization level zero and debugging symbols +included, which would not typically be the case for a packaged version of +GlusterFS. Let's put our version of `rot-13.so` into a slightly different file +on our system, so that it doesn't stomp on the installed version (not that +you'd ever want to use that anyway). + +``` +xlator_example# ls /usr/lib64/glusterfs/3git/xlator/encryption/ +crypt.so crypt.so.0 crypt.so.0.0.0 rot-13.so rot-13.so.0 +rot-13.so.0.0.0 +xlator_example# cp rot-13.so \ + /usr/lib64/glusterfs/3git/xlator/encryption/my-rot-13.so +``` + +These paths represent the current Gluster filesystem layout, which is likely to +be deprecated in favor of the Fedora layout; your paths may vary. At this point + we're ready to configure a volume using our new translator. To do that, I'm +going to suggest something that's strongly discouraged except during +development (the Gluster guys are going to hate me for this): write our own +volfile. Here's just about the simplest volfile you'll ever see. + +``` +volume my-posix + type storage/posix + option directory /srv/export +end-volume + +volume my-rot13 + type encryption/my-rot-13 + subvolumes my-posix +end-volume +``` + +All we have here is a basic brick using `/srv/export` for its data, and then +an instance of our translator layered on top -- no client or server is +necessary for what we're doing, and the system will automatically push a +mount/fuse translator on top if there's no server translator. To try this out, +all we need is the following command (assuming the directories involved already + exist). 
+ +``` +xlator_example$ glusterfs --debug -f my.vol /srv/import +``` + +You should be rewarded with a whole lot of log output, including the text of +the volfile (this is very useful for debugging problems in the field). If you +go to another window on the same machine, you can see that you have a new +filesystem mounted. + +``` +~$ df /srv/import +Filesystem 1K-blocks Used Available Use% Mounted on +/srv/xlator_example/my.vol + 114506240 2706176 105983488 3% /srv/import +``` + +Just for fun, write something into a file in `/srv/import`, then look at the +corresponding file in `/srv/export` to see it all `rot-13`'ed for you. + +``` +~$ echo hello > /srv/import/a_file +~$ cat /srv/export/a_file +uryyb +``` + +There you have it -- functionality you control, implemented easily, layered on +top of local storage. Now you could start adding functionality -- real +encryption, perhaps -- and inevitably having to debug it. You could do that the + old-school way, with `gf_log` (preferred) or even plain old `printf`, or you +could run daemons under `gdb` instead. Alternatively, you could wait for the +next Translator 101 post, where we'll be doing exactly that. + +Debugging a Translator +---------------------- + +Now that we've learned what a translator looks like and how to build one, it's +time to run one and actually watch it work. The best way to do this is good +old-fashioned `gdb`, as follows (using some of the examples from last time). + +``` +xlator_example# gdb glusterfs +GNU gdb (GDB) Red Hat Enterprise Linux (7.2-50.el6) +... +(gdb) r --debug -f my.vol /srv/import +Starting program: /usr/sbin/glusterfs --debug -f my.vol /srv/import +... +[2011-11-23 11:23:16.495516] I [fuse-bridge.c:2971:fuse_init] + 0-glusterfs-fuse: FUSE inited with protocol versions: + glusterfs 7.13 kernel 7.13 +``` + +If you get to this point, your glusterfs client process is already running. You +can go to another window to see the mountpoint, do file operations, etc. 
+ +``` +~# df /srv/import +Filesystem 1K-blocks Used Available Use% Mounted on +/root/xlator_example/my.vol + 114506240 2643968 106045568 3% /srv/import +~# ls /srv/import +a_file +~# cat /srv/import/a_file +hello +``` + +Now let's interrupt the process and see where we are. + +``` +^C +Program received signal SIGINT, Interrupt. +0x0000003a0060b3dc in pthread_cond_wait@@GLIBC_2.3.2 () + from /lib64/libpthread.so.0 +(gdb) info threads + 5 Thread 0x7fffeffff700 (LWP 27206) 0x0000003a002dd8c7 + in readv () + from /lib64/libc.so.6 + 4 Thread 0x7ffff50e3700 (LWP 27205) 0x0000003a0060b75b + in pthread_cond_timedwait@@GLIBC_2.3.2 () + from /lib64/libpthread.so.0 + 3 Thread 0x7ffff5f02700 (LWP 27204) 0x0000003a0060b3dc + in pthread_cond_wait@@GLIBC_2.3.2 () + from /lib64/libpthread.so.0 + 2 Thread 0x7ffff6903700 (LWP 27203) 0x0000003a0060f245 + in sigwait () + from /lib64/libpthread.so.0 +* 1 Thread 0x7ffff7957700 (LWP 27196) 0x0000003a0060b3dc + in pthread_cond_wait@@GLIBC_2.3.2 () + from /lib64/libpthread.so.0 +``` + +Like any non-toy server, this one has multiple threads. What are they all +doing? Honestly, even I don't know. Thread 1 turns out to be in +`event_dispatch_epoll`, which means it's the one handling all of our network +I/O. Note that with socket multi-threading patch this will change, with one +thread in `socket_poller` per connection. Thread 2 is in `glusterfs_sigwaiter` +which means signals will be isolated to that thread. Thread 3 is in +`syncenv_task`, so it's a worker process for synchronous requests such as +those used by the rebalance and repair code. Thread 4 is in +`janitor_get_next_fd`, so it's waiting for a chance to close no-longer-needed +file descriptors on the local filesystem. (I admit I had to look that one up, +BTW.) Lastly, thread 5 is in `fuse_thread_proc`, so it's the one fetching +requests from our FUSE interface. You'll often see many more threads than +this, but it's a pretty good basic set. 
Now, let's set a breakpoint so we can +actually watch a request. + +``` +(gdb) b rot13_writev +Breakpoint 1 at 0x7ffff50e4f0b: file rot-13.c, line 119. +(gdb) c +Continuing. +``` + +At this point we go into our other window and do something that will involve a write. + +``` +~# echo goodbye > /srv/import/another_file +(back to the first window) +[Switching to Thread 0x7fffeffff700 (LWP 27206)] + +Breakpoint 1, rot13_writev (frame=0x7ffff6e4402c, this=0x638440, + fd=0x7ffff409802c, vector=0x7fffe8000cd8, count=1, offset=0, + iobref=0x7fffe8001070) at rot-13.c:119 +119 rot_13_private_t *priv = (rot_13_private_t *)this->private; +``` + +Remember how we built with debugging symbols enabled and no optimization? That +will be pretty important for the next few steps. As you can see, we're in +`rot13_writev`, with several parameters. + +* `frame` is our always-present frame pointer for this request. Also, + `frame->local` will point to any local data we created and attached to the + request ourselves. +* `this` is a pointer to our instance of the `rot-13` translator. You can examine + it if you like to see the name, type, options, parent/children, inode table, + and other stuff associated with it. +* `fd` is a pointer to a file-descriptor *object* (`fd_t`, not just a + file-descriptor index which is what most people use "fd" for). This in turn + points to an inode object (`inode_t`) and we can associate our own + `rot-13`-specific data with either of these. +* `vector` and `count` together describe the data buffers for this write, which + we'll get to in a moment. +* `offset` is the offset into the file at which we're writing. +* `iobref` is a buffer-reference object, which is used to track the life cycle + of buffers containing read/write data. If you look closely, you'll notice that + `vector[0].iov_base` points to the same address as `iobref->iobrefs[0].ptr`, which + should give you some idea of the inter-relationships between vector and iobref. 
+ +OK, now what about that `vector`? We can use it to examine the data being +written, like this. + +``` +(gdb) p vector[0] +$2 = {iov_base = 0x7ffff7936000, iov_len = 8} +(gdb) x/s 0x7ffff7936000 +0x7ffff7936000: "goodbye\n" +``` + +It's not always safe to view this data as a string, because it might just as +well be binary data, but since we're generating the write this time it's safe +and convenient. With that knowledge, let's step through things a bit. + +``` +(gdb) s +120 if (priv->encrypt_write) +(gdb) +121 rot13_iovec (vector, count); +(gdb) +rot13_iovec (vector=0x7fffe8000cd8, count=1) at rot-13.c:57 +57 for (i = 0; i < count; i++) { +(gdb) +58 rot13 (vector[i].iov_base, vector[i].iov_len); +(gdb) +rot13 (buf=0x7ffff7936000 "goodbye\n", len=8) at rot-13.c:45 +45 for (i = 0; i < len; i++) { +(gdb) +46 if (buf[i] >= 'a' && buf[i] <= 'z') +(gdb) +47 buf[i] = 'a' + ((buf[i] - 'a' + 13) % 26); +``` + +Here we've stepped into `rot13_iovec`, which iterates through our vector +calling `rot13`, which in turn iterates through the characters in that chunk +doing the `rot-13` operation if/as appropriate. This is pretty straightforward +stuff, so let's skip to the next interesting bit. + +``` +(gdb) fin +Run till exit from #0 rot13 (buf=0x7ffff7936000 "goodbye\n", + len=8) at rot-13.c:47 +rot13_iovec (vector=0x7fffe8000cd8, count=1) at rot-13.c:57 +57 for (i = 0; i < count; i++) { +(gdb) fin +Run till exit from #0 rot13_iovec (vector=0x7fffe8000cd8, + count=1) at rot-13.c:57 +rot13_writev (frame=0x7ffff6e4402c, this=0x638440, + fd=0x7ffff409802c, vector=0x7fffe8000cd8, count=1, + offset=0, iobref=0x7fffe8001070) at rot-13.c:123 +123 STACK_WIND (frame, +(gdb) b 129 +Breakpoint 2 at 0x7ffff50e4f35: file rot-13.c, line 129. +(gdb) b rot13_writev_cbk +Breakpoint 3 at 0x7ffff50e4db3: file rot-13.c, line 106. +(gdb) c +``` + +So we've set breakpoints on both the callback and the statement following the +`STACK_WIND`. Which one will we hit first? 
+ +``` +Breakpoint 3, rot13_writev_cbk (frame=0x7ffff6e4402c, + cookie=0x7ffff6e440d8, this=0x638440, op_ret=8, op_errno=0, + prebuf=0x7fffefffeca0, postbuf=0x7fffefffec30) + at rot-13.c:106 +106 STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, + prebuf, postbuf); +(gdb) bt +#0 rot13_writev_cbk (frame=0x7ffff6e4402c, + cookie=0x7ffff6e440d8, this=0x638440, op_ret=8, op_errno=0, + prebuf=0x7fffefffeca0, postbuf=0x7fffefffec30) + at rot-13.c:106 +#1 0x00007ffff52f1b37 in posix_writev (frame=0x7ffff6e440d8, + this=<value optimized out>, fd=<value optimized out>, + vector=<value optimized out>, count=1, + offset=<value optimized out>, iobref=0x7fffe8001070) + at posix.c:2217 +#2 0x00007ffff50e513e in rot13_writev (frame=0x7ffff6e4402c, + this=0x638440, fd=0x7ffff409802c, vector=0x7fffe8000cd8, + count=1, offset=0, iobref=0x7fffe8001070) at rot-13.c:123 +``` + +Surprise! We're in `rot13_writev_cbk` now, called (indirectly) while we're +still in `rot13_writev` before `STACK_WIND` returns (still at rot-13.c:123). If + you did any request cleanup here, then you need to be careful about what you +do in the remainder of `rot13_writev` because data may have been freed etc. +It's tempting to say you should just do the cleanup in `rot13_writev` after +the `STACK_WIND,` but that's not valid because it's also possible that some +other translator returned without calling `STACK_UNWIND` -- i.e. before +`rot13_writev` is called, so then it would be the one getting null-pointer +errors instead. To put it another way, the callback and the return from +`STACK_WIND` can occur in either order or even simultaneously on different +threads. Even if you were to use reference counts, you'd have to make sure to +use locking or atomic operations to avoid races, and it's not worth it. 
Unless +you *really* understand the possible flows of control and know what you're +doing, it's better to do cleanup in the callback and nothing after +`STACK_WIND.` + +At this point all that's left is a `STACK_UNWIND` and a return. The +`STACK_UNWIND` invokes our parent's completion callback, and in this case our +parent is FUSE so at that point the VFS layer is notified of the write being +complete. Finally, we return through several levels of normal function calls +until we come back to fuse_thread_proc, which waits for the next request. + +So that's it. For extra fun, you might want to repeat this exercise by stepping +through some other call -- stat or setxattr might be good choices -- but you'll + have to use a translator that actually implements those calls to see much +that's interesting. Then you'll pretty much know everything I knew when I +started writing my first for-real translators, and probably even a bit more. I +hope you've enjoyed this series, or at least found it useful, and if you have +any suggestions for other topics I should cover please let me know (via +comments or email, IRC or Twitter). diff --git a/doc/hacker-guide/en-US/markdown/write-behind.md b/doc/hacker-guide/en-US/markdown/write-behind.md new file mode 100644 index 000000000..e20682249 --- /dev/null +++ b/doc/hacker-guide/en-US/markdown/write-behind.md @@ -0,0 +1,56 @@ +performance/write-behind translator +=================================== + +Basic working +-------------- + +Write behind is basically a translator to lie to the application that the +write-requests are finished, even before it is actually finished. + +On a regular translator tree without write-behind, control flow is like this: + +1. application makes a `write()` system call. +2. VFS ==> FUSE ==> `/dev/fuse`. +3. fuse-bridge initiates a glusterfs `writev()` call. +4. `writev()` is `STACK_WIND()`ed upto client-protocol or storage translator. +5. 
client-protocol, on receiving reply from server, starts `STACK_UNWIND()` towards the fuse-bridge.
+
+On a translator tree with write-behind, control flow is like this:
+
+1. application makes a `write()` system call.
+2. VFS ==> FUSE ==> `/dev/fuse`.
+3. fuse-bridge initiates a glusterfs `writev()` call.
+4. `writev()` is `STACK_WIND()`ed upto write-behind translator.
+5. write-behind adds the write buffer to its internal queue and does a `STACK_UNWIND()` towards the fuse-bridge.
+
+write call is completed from the application's perspective. after
+`STACK_UNWIND()`ing towards the fuse-bridge, write-behind initiates a fresh
+writev() call to its child translator, whose replies will be consumed by
+write-behind itself. Write-behind _doesn't_ cache the write buffer, unless
+`option flush-behind on` is specified in volume specification file.
+
+Windowing
+---------
+
+With respect to write-behind, each write-buffer has three flags: `stack_wound`, `write_behind` and `got_reply`.
+
+* `stack_wound`: if set, indicates that write-behind has initiated `STACK_WIND()` towards child translator.
+* `write_behind`: if set, indicates that write-behind has done `STACK_UNWIND()` towards fuse-bridge.
+* `got_reply`: if set, indicates that write-behind has received reply from child translator for a `writev()` `STACK_WIND()`. a request will be destroyed by write-behind only if this flag is set.
+
+Currently pending write requests = aggregate size of requests with write_behind = 1 and got_reply = 0.
+
+window size limits the aggregate size of currently pending write requests. once
+the pending requests' size has reached the window size, write-behind blocks
+writev() calls from fuse-bridge. Blocking is only from application's
+perspective. Write-behind does `STACK_WIND()` to child translator
+straight-away, but holds back the `STACK_UNWIND()` towards fuse-bridge.
+`STACK_UNWIND()` is done only once write-behind gets enough replies to
+accommodate the currently blocked request.
+ +Flush behind +------------ + +If `option flush-behind on` is specified in volume specification file, then +write-behind sends aggregate write requests to child translator, instead of +regular per request `STACK_WIND()`s. diff --git a/doc/hacker-guide/lock-ahead.txt b/doc/hacker-guide/lock-ahead.txt deleted file mode 100644 index 63392b7fa..000000000 --- a/doc/hacker-guide/lock-ahead.txt +++ /dev/null @@ -1,80 +0,0 @@ - Lock-ahead translator - --------------------- - -The objective of the lock-ahead translator is to speculatively -hold locks (inodelk and entrylk) on the universal set (0 - infinity -in case of inodelk and all basenames in case of entrylk) even -when a lock is requested only on a subset, in anticipation that -further locks will be requested within the same universal set. - -So, for example, when cluster/replicate locks a region before -writing to it, lock-ahead would instead lock the entire file. -On further writes, lock-ahead can immediately return success for -the lock requests, since the entire file has been previously locked. - -To avoid starvation of other clients/mountpoints, we employ a -notify mechanism, described below. - -typedef struct { - struct list_head subset_locks; -} la_universal_lock_t; - -Universal lock structure is stored in the inode context. 
- -typedef struct { - enum {LOCK_AHEAD_ENTRYLK, LOCK_AHEAD_FENTRYLK, - LOCK_AHEAD_INODELK, LOCK_AHEAD_FINODELK}; - - union { - fd_t *fd; - loc_t loc; - }; - - off_t l_start; - off_t l_len; - - const char *basename; - - struct list_head universal_lock; -} la_subset_lock_t; - - -fops implemented: - -* inodelk/finodelk/entrylk/fentrylk: - -lock: - if universal lock held: - add subset to it (save loc_t or fd) and return success - else: - send lock-notify fop - hold universal lock and return - (set inode context, add subset to it, save loc_t or fd) - - if this fails: - forward the lock request - -unlock: - if subset exists in universal lock: - delete subset lock from list - else: - forward it - -* release: - hold subset locks (each subset lock using the saved loc_t or fd) - and release universal lock - -* lock-notify (on unwind) (new fop) - hold subset locks and release universal lock - - -lock-notify in locks translator: - -if a subset lock in entrylk/inodelk cannot be satisfied -because of a universal lock held by someone else: - unwind the lock-notify fop - -============================================== -$ Last updated: Tue Feb 17 11:31:18 IST 2009 $ -$ Author: Vikas Gorur <vikas@gluster.com> $ -============================================== diff --git a/doc/hacker-guide/posix.txt b/doc/hacker-guide/posix.txt deleted file mode 100644 index d0132abfe..000000000 --- a/doc/hacker-guide/posix.txt +++ /dev/null @@ -1,59 +0,0 @@ ---------------- -* storage/posix ---------------- - -- SET_FS_ID - - This is so that all filesystem checks are done with the user's - uid/gid and not GlusterFS's uid/gid. - -- MAKE_REAL_PATH - - This macro concatenates the base directory of the posix volume - ('option directory') with the given path. - -- need_xattr in lookup - - If this flag is passed, lookup returns a xattr dictionary that contains - the file's create time, the file's contents, and the version number - of the file. - - This is a hack to increase small file performance. 
If an application - wants to read a small file, it can finish its job with just a lookup - call instead of a lookup followed by read. - -- getdents/setdents - - These are used by unify to set and get directory entries. - -- ALIGN_BUF - - Macro to align an address to a page boundary (4K). - -- priv->export_statfs - - In some cases, two exported volumes may reside on the same - partition on the server. Sending statvfs info for both - the volumes will lead to erroneous df output at the client, - since free space on the partition will be counted twice. - - In such cases, user can disable exporting statvfs info - on one of the volumes by setting this option. - -- xattrop - - This fop is used by replicate to set version numbers on files. - -- getxattr/setxattr hack to read/write files - - A key, GLUSTERFS_FILE_CONTENT_STRING, is handled in a special way by - getxattr/setxattr. A getxattr with the key will return the entire - content of the file as the value. A setxattr with the key will write - the value as the entire content of the file. - -- posix_checksum - - This calculates a simple XOR checksum on all entry names in a - directory that is used by unify to compare directory contents. - - diff --git a/doc/hacker-guide/write-behind.txt b/doc/hacker-guide/write-behind.txt deleted file mode 100644 index a6e9a8890..000000000 --- a/doc/hacker-guide/write-behind.txt +++ /dev/null @@ -1,45 +0,0 @@ -basic working --------------- - - write behind is basically a translator to lie to the application that the write-requests are finished, even before it is actually finished. - - on a regular translator tree without write-behind, control flow is like this: - - 1. application makes a write() system call. - 2. VFS ==> FUSE ==> /dev/fuse. - 3. fuse-bridge initiates a glusterfs writev() call. - 4. writev() is STACK_WIND()ed upto client-protocol or storage translator. - 5. client-protocol, on receiving reply from server, starts STACK_UNWIND() towards the fuse-bridge. 
- - on a translator tree with write-behind, control flow is like this: - - 1. application makes a write() system call. - 2. VFS ==> FUSE ==> /dev/fuse. - 3. fuse-bridge initiates a glusterfs writev() call. - 4. writev() is STACK_WIND()ed upto write-behind translator. - 5. write-behind adds the write buffer to its internal queue and does a STACK_UNWIND() towards the fuse-bridge. - - write call is completed in application's percepective. after STACK_UNWIND()ing towards the fuse-bridge, write-behind initiates a fresh writev() call to its child translator, whose replies will be consumed by write-behind itself. write-behind _doesn't_ cache the write buffer, unless 'option flush-behind on' is specified in volume specification file. - -windowing ---------- - - write respect to write-behind, each write-buffer has three flags: 'stack_wound', 'write_behind' and 'got_reply'. - - stack_wound: if set, indicates that write-behind has initiated STACK_WIND() towards child translator. - - write_behind: if set, indicates that write-behind has done STACK_UNWIND() towards fuse-bridge. - - got_reply: if set, indicates that write-behind has received reply from child translator for a writev() STACK_WIND(). a request will be destroyed by write-behind only if this flag is set. - - currently pending write requests = aggregate size of requests with write_behind = 1 and got_reply = 0. - - window size limits the aggregate size of currently pending write requests. once the pending requests' size has reached the window size, write-behind blocks writev() calls from fuse-bridge. - blocking is only from application's perspective. write-behind does STACK_WIND() to child translator straight-away, but hold behind the STACK_UNWIND() towards fuse-bridge. STACK_UNWIND() is done only once write-behind gets enough replies to accomodate for currently blocked request. 
- -flush behind ------------- - - if 'option flush-behind on' is specified in volume specification file, then write-behind sends aggregate write requests to child translator, instead of regular per request STACK_WIND()s. - - diff --git a/doc/user-guide/Makefile.am b/doc/legacy/Makefile.am index 800e7321d..b2caabaa2 100644 --- a/doc/user-guide/Makefile.am +++ b/doc/legacy/Makefile.am @@ -1,3 +1,3 @@ info_TEXINFOS = user-guide.texi -CLEANFILES = *~ +CLEANFILES = *~ DISTCLEANFILES = .deps/*.P *.info *vti diff --git a/doc/user-guide/advanced-stripe.odg b/doc/legacy/advanced-stripe.odg Binary files differindex 7686d7091..7686d7091 100644 --- a/doc/user-guide/advanced-stripe.odg +++ b/doc/legacy/advanced-stripe.odg diff --git a/doc/user-guide/advanced-stripe.pdf b/doc/legacy/advanced-stripe.pdf Binary files differindex ec8b03dcf..ec8b03dcf 100644 --- a/doc/user-guide/advanced-stripe.pdf +++ b/doc/legacy/advanced-stripe.pdf diff --git a/doc/booster.txt b/doc/legacy/booster.txt index 684ac8965..051401a28 100644 --- a/doc/booster.txt +++ b/doc/legacy/booster.txt @@ -1,6 +1,6 @@ Introduction ============ -* booster is a LD_PRELOADable library which boosts read/write performance by bypassing fuse for +* booster is a LD_PRELOADable library which boosts read/write performance by bypassing fuse for read() and write() calls. Requirements @@ -14,29 +14,29 @@ Design * contents of client volume-file. * mount point. -* LD_PRELOADed booster.so maintains an hash table storing mount-points and libglusterfsclient handles - so that handles are reused for files from same mount point. +* LD_PRELOADed booster.so maintains an hash table storing mount-points and libglusterfsclient handles + so that handles are reused for files from same mount point. * it also maintains a fdtable. fdtable maps the fd (integer) returned to application to fd (pointer to fd struct) used by libglusterfsclient. application is returned the same fd as the one returned from libc apis. 
* During fork, these tables are overwritten to enable creation of fresh glusterfs context in child. - + Working ======= -* application willing to use booster LD_PRELOADs booster.so which is a wrapper library implementing +* application willing to use booster LD_PRELOADs booster.so which is a wrapper library implementing open, read and write. -* application should specify the path to logfile through the environment variable GLFS_BOOSTER_LOGFILE. If +* application should specify the path to logfile through the environment variable GLFS_BOOSTER_LOGFILE. If not specified, logging is done to /dev/stderr. * open call does, * real_open on the file. * fgetxattr(fd). - * store the volume-file content got in the dictionary to a temparory file. - * look in the hashtable for the mount-point, if already present get the libglusterfsclient handle from the - hashtable. Otherwise get a new handle from libglusterfsclient (be careful about mount point not present in - the hashtable and multiple glusterfs_inits running simultaneously for the same mount-point there by using + * store the volume-file content got in the dictionary to a temporary file. + * look in the hashtable for the mount-point, if already present get the libglusterfsclient handle from the + hashtable. Otherwise get a new handle from libglusterfsclient (be careful about mount point not present in + the hashtable and multiple glusterfs_inits running simultaneously for the same mount-point there by using multiple handles for the same mount point). * real_close (fd). * delete temporary volume-volfile. @@ -51,4 +51,4 @@ Working * close call does, * remove the fd from the fdtable. -* other calls use real_calls. +* other calls use real_calls. 
diff --git a/doc/user-guide/colonO-icon.jpg b/doc/legacy/colonO-icon.jpg Binary files differindex 3e66f7a27..3e66f7a27 100644 --- a/doc/user-guide/colonO-icon.jpg +++ b/doc/legacy/colonO-icon.jpg diff --git a/doc/errno.list.bsd.txt b/doc/legacy/errno.list.bsd.txt index 350af25e4..350af25e4 100644 --- a/doc/errno.list.bsd.txt +++ b/doc/legacy/errno.list.bsd.txt diff --git a/doc/errno.list.linux.txt b/doc/legacy/errno.list.linux.txt index baa50792d..cc868644b 100644 --- a/doc/errno.list.linux.txt +++ b/doc/legacy/errno.list.linux.txt @@ -95,7 +95,7 @@ extern "C" { /** * @defgroup apr_errno Error Codes - * @ingroup APR + * @ingroup APR * @{ */ @@ -110,7 +110,7 @@ typedef int apr_status_t; * @param buf A buffer to hold the error string. * @param bufsize Size of the buffer to hold the string. */ -APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, +APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, apr_size_t bufsize); #if defined(DOXYGEN) @@ -130,7 +130,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * Fold an apr_status_t code back to the native platform defined error. * @param e The apr_status_t folded platform os error code. * @warning macro implementation; the statcode argument may be evaluated - * multiple times. If the statcode was not created by apr_get_os_error + * multiple times. If the statcode was not created by apr_get_os_error * or APR_FROM_OS_ERROR, the results are undefined. */ #define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) @@ -166,7 +166,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * @warning This is a macro implementation; the statcode argument may be evaluated * multiple times. If the statcode was not created by apr_get_os_error * or APR_FROM_OS_ERROR, the results are undefined. 
This macro sets - * errno, or calls a WSASetLastError() style function, unfolding + * errno, or calls a WSASetLastError() style function, unfolding * socketcode with APR_TO_OS_ERROR. */ @@ -206,12 +206,12 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_OS_START_CANONERR (APR_OS_START_USERERR \ + (APR_OS_ERRSPACE_SIZE * 10)) /** - * APR_OS_START_EAIERR folds EAI_ error codes from getaddrinfo() into + * APR_OS_START_EAIERR folds EAI_ error codes from getaddrinfo() into * apr_status_t values. */ #define APR_OS_START_EAIERR (APR_OS_START_CANONERR + APR_OS_ERRSPACE_SIZE) /** - * APR_OS_START_SYSERR folds platform-specific system error values into + * APR_OS_START_SYSERR folds platform-specific system error values into * apr_status_t values. */ #define APR_OS_START_SYSERR (APR_OS_START_EAIERR + APR_OS_ERRSPACE_SIZE) @@ -219,13 +219,13 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** no error. */ #define APR_SUCCESS 0 -/** +/** * @defgroup APR_Error APR Error Values * <PRE> * <b>APR ERROR VALUES</b> - * APR_ENOSTAT APR was unable to perform a stat on the file + * APR_ENOSTAT APR was unable to perform a stat on the file * APR_ENOPOOL APR was not provided a pool with which to allocate memory - * APR_EBADDATE APR was given an invalid date + * APR_EBADDATE APR was given an invalid date * APR_EINVALSOCK APR was given an invalid socket * APR_ENOPROC APR was not given a process structure * APR_ENOTIME APR was not given a time structure @@ -236,7 +236,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * APR_ENOTHREAD APR was not given a thread structure * APR_ENOTHDKEY APR was not given a thread key structure * APR_ENOSHMAVAIL There is no more shared memory available - * APR_EDSOOPEN APR was unable to open the dso object. For more + * APR_EDSOOPEN APR was unable to open the dso object. For more * information call apr_dso_error(). 
* APR_EGENERAL General failure (specific information not available) * APR_EBADIP The specified IP address is invalid @@ -256,17 +256,17 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * APR_INCOMPLETE The operation was incomplete although some processing * was performed and the results are partially valid * APR_BADCH Getopt found an option not in the option string - * APR_BADARG Getopt found an option that is missing an argument + * APR_BADARG Getopt found an option that is missing an argument * and an argument was specified in the option string * APR_EOF APR has encountered the end of the file * APR_NOTFOUND APR was unable to find the socket in the poll structure * APR_ANONYMOUS APR is using anonymous shared memory * APR_FILEBASED APR is using a file name as the key to the shared memory * APR_KEYBASED APR is using a shared key as the key to the shared memory - * APR_EINIT Ininitalizer value. If no option has been found, but + * APR_EINIT Ininitalizer value. If no option has been found, but * the status variable requires a value, this should be used - * APR_ENOTIMPL The APR function has not been implemented on this - * platform, either because nobody has gotten to it yet, + * APR_ENOTIMPL The APR function has not been implemented on this + * platform, either because nobody has gotten to it yet, * or the function is impossible on this platform. * APR_EMISMATCH Two passwords do not match. * APR_EABSOLUTE The given path was absolute. @@ -334,7 +334,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_ENOTENOUGHENTROPY (APR_OS_START_ERROR + 28) /** @} */ -/** +/** * @defgroup APR_STATUS_IS Status Value Tests * @warning For any particular error condition, more than one of these tests * may match. This is because platform-specific error codes may not @@ -345,16 +345,16 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * adjust the order of the tests accordingly. 
* @{ */ -/** - * APR was unable to perform a stat on the file +/** + * APR was unable to perform a stat on the file * @warning always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ENOSTAT(s) ((s) == APR_ENOSTAT) -/** - * APR was not provided a pool with which to allocate memory +/** + * APR was not provided a pool with which to allocate memory * @warning always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ENOPOOL(s) ((s) == APR_ENOPOOL) /** APR was given an invalid date */ @@ -386,8 +386,8 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** The specified netmask is invalid */ #define APR_STATUS_IS_EBADMASK(s) ((s) == APR_EBADMASK) /* empty slot: +18 */ -/** - * APR was unable to open the dso object. +/** + * APR was unable to open the dso object. * For more information call apr_dso_error(). 
*/ #if defined(WIN32) @@ -425,7 +425,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** @} */ -/** +/** * @addtogroup APR_Error * @{ */ @@ -466,7 +466,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** @see APR_STATUS_IS_KEYBASED */ #define APR_KEYBASED (APR_OS_START_STATUS + 21) /** @see APR_STATUS_IS_EINIT */ -#define APR_EINIT (APR_OS_START_STATUS + 22) +#define APR_EINIT (APR_OS_START_STATUS + 22) /** @see APR_STATUS_IS_ENOTIMPL */ #define APR_ENOTIMPL (APR_OS_START_STATUS + 23) /** @see APR_STATUS_IS_EMISMATCH */ @@ -475,156 +475,156 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_EBUSY (APR_OS_START_STATUS + 25) /** @} */ -/** +/** * @addtogroup APR_STATUS_IS * @{ */ -/** - * Program is currently executing in the child +/** + * Program is currently executing in the child * @warning * always use this test, as platform-specific variances may meet this * more than one error code */ #define APR_STATUS_IS_INCHILD(s) ((s) == APR_INCHILD) -/** - * Program is currently executing in the parent +/** + * Program is currently executing in the parent * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_INPARENT(s) ((s) == APR_INPARENT) -/** - * The thread is detached +/** + * The thread is detached * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_DETACH(s) ((s) == APR_DETACH) -/** - * The thread is not detached +/** + * The thread is not detached * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_NOTDETACH(s) ((s) == APR_NOTDETACH) -/** +/** * The child has finished executing * @warning * always use this test, as platform-specific variances may meet this - * 
more than one error code + * more than one error code */ #define APR_STATUS_IS_CHILD_DONE(s) ((s) == APR_CHILD_DONE) -/** +/** * The child has not finished executing * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_CHILD_NOTDONE(s) ((s) == APR_CHILD_NOTDONE) -/** +/** * The operation did not finish before the timeout * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP) -/** +/** * The operation was incomplete although some processing was performed * and the results are partially valid. * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_INCOMPLETE(s) ((s) == APR_INCOMPLETE) /* empty slot: +9 */ /* empty slot: +10 */ /* empty slot: +11 */ -/** +/** * Getopt found an option not in the option string * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_BADCH(s) ((s) == APR_BADCH) -/** - * Getopt found an option not in the option string and an argument was +/** + * Getopt found an option not in the option string and an argument was * specified in the option string * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_BADARG(s) ((s) == APR_BADARG) -/** +/** * APR has encountered the end of the file * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EOF(s) ((s) == APR_EOF) -/** +/** * APR was unable to find the socket in the poll structure * @warning * always use this test, as 
platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_NOTFOUND(s) ((s) == APR_NOTFOUND) /* empty slot: +16 */ /* empty slot: +17 */ /* empty slot: +18 */ -/** +/** * APR is using anonymous shared memory * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ANONYMOUS(s) ((s) == APR_ANONYMOUS) -/** +/** * APR is using a file name as the key to the shared memory * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_FILEBASED(s) ((s) == APR_FILEBASED) -/** +/** * APR is using a shared key as the key to the shared memory * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_KEYBASED(s) ((s) == APR_KEYBASED) -/** - * Ininitalizer value. If no option has been found, but +/** + * Ininitalizer value. If no option has been found, but * the status variable requires a value, this should be used * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EINIT(s) ((s) == APR_EINIT) -/** - * The APR function has not been implemented on this - * platform, either because nobody has gotten to it yet, +/** + * The APR function has not been implemented on this + * platform, either because nobody has gotten to it yet, * or the function is impossible on this platform. * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ENOTIMPL(s) ((s) == APR_ENOTIMPL) -/** +/** * Two passwords do not match. 
* @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EMISMATCH(s) ((s) == APR_EMISMATCH) -/** +/** * The given lock was busy * @warning always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EBUSY(s) ((s) == APR_EBUSY) /** @} */ -/** +/** * @addtogroup APR_Error APR Error Values * @{ */ @@ -713,8 +713,8 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_ESPIPE (APR_OS_START_CANONERR + 12) #endif -/** - * @see APR_STATUS_IS_EAGAIN +/** + * @see APR_STATUS_IS_EAGAIN * @warning use APR_STATUS_IS_EAGAIN instead of just testing this value */ #ifdef EAGAIN @@ -753,7 +753,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_EINPROGRESS (APR_OS_START_CANONERR + 17) #endif -/** +/** * @see APR_STATUS_IS_ECONNABORTED * @warning use APR_STATUS_IS_ECONNABORTED instead of just testing this value */ @@ -771,7 +771,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_ECONNRESET (APR_OS_START_CANONERR + 19) #endif -/** @see APR_STATUS_IS_ETIMEDOUT +/** @see APR_STATUS_IS_ETIMEDOUT * @deprecated */ #ifdef ETIMEDOUT #define APR_ETIMEDOUT ETIMEDOUT @@ -849,7 +849,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, */ #define APR_OS2_STATUS(e) (APR_FROM_OS_ERROR(e)) -/* These can't sit in a private header, so in spite of the extra size, +/* These can't sit in a private header, so in spite of the extra size, * they need to be made available here. 
*/ #define SOCBASEERR 10000 @@ -946,10 +946,10 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, || (s) == APR_OS_START_SYSERR + SOCECONNRESET) /* XXX deprecated */ #define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ - || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) #undef APR_STATUS_IS_TIMEUP #define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ - || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) #define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ || (s) == APR_OS_START_SYSERR + SOCEHOSTUNREACH) #define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ @@ -1182,7 +1182,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define apr_get_netos_error() (errno) #define apr_set_netos_error(e) (errno = (e)) -/** +/** * @addtogroup APR_STATUS_IS * @{ */ @@ -1246,15 +1246,15 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** operation now in progress */ #define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS) -/** - * Software caused connection abort +/** + * Software caused connection abort * @remark - * EPROTO on certain older kernels really means ECONNABORTED, so we need to + * EPROTO on certain older kernels really means ECONNABORTED, so we need to * ignore it for them. See discussion in new-httpd archives nh.9701 & nh.9603 * - * There is potentially a bug in Solaris 2.x x<6, and other boxes that + * There is potentially a bug in Solaris 2.x x<6, and other boxes that * implement tcp sockets in userland (i.e. on top of STREAMS). On these - * systems, EPROTO can actually result in a fatal loop. See PR#981 for + * systems, EPROTO can actually result in a fatal loop. See PR#981 for * example. It's hard to handle both uses of EPROTO. 
*/ #ifdef EPROTO diff --git a/doc/errno.list.macosx.txt b/doc/legacy/errno.list.macosx.txt index 728753ac7..4954e03d8 100644 --- a/doc/errno.list.macosx.txt +++ b/doc/legacy/errno.list.macosx.txt @@ -34,7 +34,7 @@ extern "C" { /** * @defgroup apr_errno Error Codes - * @ingroup APR + * @ingroup APR * @{ */ @@ -49,7 +49,7 @@ typedef int apr_status_t; * @param buf A buffer to hold the error string. * @param bufsize Size of the buffer to hold the string. */ -APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, +APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, apr_size_t bufsize); #if defined(DOXYGEN) @@ -69,7 +69,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * Fold an apr_status_t code back to the native platform defined error. * @param e The apr_status_t folded platform os error code. * @warning macro implementation; the statcode argument may be evaluated - * multiple times. If the statcode was not created by apr_get_os_error + * multiple times. If the statcode was not created by apr_get_os_error * or APR_FROM_OS_ERROR, the results are undefined. */ #define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) @@ -105,7 +105,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * @warning This is a macro implementation; the statcode argument may be evaluated * multiple times. If the statcode was not created by apr_get_os_error * or APR_FROM_OS_ERROR, the results are undefined. This macro sets - * errno, or calls a WSASetLastError() style function, unfolding + * errno, or calls a WSASetLastError() style function, unfolding * socketcode with APR_TO_OS_ERROR. 
*/ @@ -145,12 +145,12 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_OS_START_CANONERR (APR_OS_START_USERERR \ + (APR_OS_ERRSPACE_SIZE * 10)) /** - * APR_OS_START_EAIERR folds EAI_ error codes from getaddrinfo() into + * APR_OS_START_EAIERR folds EAI_ error codes from getaddrinfo() into * apr_status_t values. */ #define APR_OS_START_EAIERR (APR_OS_START_CANONERR + APR_OS_ERRSPACE_SIZE) /** - * APR_OS_START_SYSERR folds platform-specific system error values into + * APR_OS_START_SYSERR folds platform-specific system error values into * apr_status_t values. */ #define APR_OS_START_SYSERR (APR_OS_START_EAIERR + APR_OS_ERRSPACE_SIZE) @@ -158,13 +158,13 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** no error. */ #define APR_SUCCESS 0 -/** +/** * @defgroup APR_Error APR Error Values * <PRE> * <b>APR ERROR VALUES</b> - * APR_ENOSTAT APR was unable to perform a stat on the file + * APR_ENOSTAT APR was unable to perform a stat on the file * APR_ENOPOOL APR was not provided a pool with which to allocate memory - * APR_EBADDATE APR was given an invalid date + * APR_EBADDATE APR was given an invalid date * APR_EINVALSOCK APR was given an invalid socket * APR_ENOPROC APR was not given a process structure * APR_ENOTIME APR was not given a time structure @@ -175,7 +175,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * APR_ENOTHREAD APR was not given a thread structure * APR_ENOTHDKEY APR was not given a thread key structure * APR_ENOSHMAVAIL There is no more shared memory available - * APR_EDSOOPEN APR was unable to open the dso object. For more + * APR_EDSOOPEN APR was unable to open the dso object. For more * information call apr_dso_error(). 
* APR_EGENERAL General failure (specific information not available) * APR_EBADIP The specified IP address is invalid @@ -195,17 +195,17 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * APR_INCOMPLETE The operation was incomplete although some processing * was performed and the results are partially valid * APR_BADCH Getopt found an option not in the option string - * APR_BADARG Getopt found an option that is missing an argument + * APR_BADARG Getopt found an option that is missing an argument * and an argument was specified in the option string * APR_EOF APR has encountered the end of the file * APR_NOTFOUND APR was unable to find the socket in the poll structure * APR_ANONYMOUS APR is using anonymous shared memory * APR_FILEBASED APR is using a file name as the key to the shared memory * APR_KEYBASED APR is using a shared key as the key to the shared memory - * APR_EINIT Ininitalizer value. If no option has been found, but + * APR_EINIT Ininitalizer value. If no option has been found, but * the status variable requires a value, this should be used - * APR_ENOTIMPL The APR function has not been implemented on this - * platform, either because nobody has gotten to it yet, + * APR_ENOTIMPL The APR function has not been implemented on this + * platform, either because nobody has gotten to it yet, * or the function is impossible on this platform. * APR_EMISMATCH Two passwords do not match. * APR_EABSOLUTE The given path was absolute. @@ -273,7 +273,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_ENOTENOUGHENTROPY (APR_OS_START_ERROR + 28) /** @} */ -/** +/** * @defgroup APR_STATUS_IS Status Value Tests * @warning For any particular error condition, more than one of these tests * may match. This is because platform-specific error codes may not @@ -284,16 +284,16 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * adjust the order of the tests accordingly. 
* @{ */ -/** - * APR was unable to perform a stat on the file +/** + * APR was unable to perform a stat on the file * @warning always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ENOSTAT(s) ((s) == APR_ENOSTAT) -/** - * APR was not provided a pool with which to allocate memory +/** + * APR was not provided a pool with which to allocate memory * @warning always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ENOPOOL(s) ((s) == APR_ENOPOOL) /** APR was given an invalid date */ @@ -325,8 +325,8 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** The specified netmask is invalid */ #define APR_STATUS_IS_EBADMASK(s) ((s) == APR_EBADMASK) /* empty slot: +18 */ -/** - * APR was unable to open the dso object. +/** + * APR was unable to open the dso object. * For more information call apr_dso_error(). 
*/ #if defined(WIN32) @@ -364,7 +364,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** @} */ -/** +/** * @addtogroup APR_Error * @{ */ @@ -405,7 +405,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** @see APR_STATUS_IS_KEYBASED */ #define APR_KEYBASED (APR_OS_START_STATUS + 21) /** @see APR_STATUS_IS_EINIT */ -#define APR_EINIT (APR_OS_START_STATUS + 22) +#define APR_EINIT (APR_OS_START_STATUS + 22) /** @see APR_STATUS_IS_ENOTIMPL */ #define APR_ENOTIMPL (APR_OS_START_STATUS + 23) /** @see APR_STATUS_IS_EMISMATCH */ @@ -414,156 +414,156 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_EBUSY (APR_OS_START_STATUS + 25) /** @} */ -/** +/** * @addtogroup APR_STATUS_IS * @{ */ -/** - * Program is currently executing in the child +/** + * Program is currently executing in the child * @warning * always use this test, as platform-specific variances may meet this * more than one error code */ #define APR_STATUS_IS_INCHILD(s) ((s) == APR_INCHILD) -/** - * Program is currently executing in the parent +/** + * Program is currently executing in the parent * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_INPARENT(s) ((s) == APR_INPARENT) -/** - * The thread is detached +/** + * The thread is detached * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_DETACH(s) ((s) == APR_DETACH) -/** - * The thread is not detached +/** + * The thread is not detached * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_NOTDETACH(s) ((s) == APR_NOTDETACH) -/** +/** * The child has finished executing * @warning * always use this test, as platform-specific variances may meet this - * 
more than one error code + * more than one error code */ #define APR_STATUS_IS_CHILD_DONE(s) ((s) == APR_CHILD_DONE) -/** +/** * The child has not finished executing * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_CHILD_NOTDONE(s) ((s) == APR_CHILD_NOTDONE) -/** +/** * The operation did not finish before the timeout * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP) -/** +/** * The operation was incomplete although some processing was performed * and the results are partially valid. * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_INCOMPLETE(s) ((s) == APR_INCOMPLETE) /* empty slot: +9 */ /* empty slot: +10 */ /* empty slot: +11 */ -/** +/** * Getopt found an option not in the option string * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_BADCH(s) ((s) == APR_BADCH) -/** - * Getopt found an option not in the option string and an argument was +/** + * Getopt found an option not in the option string and an argument was * specified in the option string * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_BADARG(s) ((s) == APR_BADARG) -/** +/** * APR has encountered the end of the file * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EOF(s) ((s) == APR_EOF) -/** +/** * APR was unable to find the socket in the poll structure * @warning * always use this test, as 
platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_NOTFOUND(s) ((s) == APR_NOTFOUND) /* empty slot: +16 */ /* empty slot: +17 */ /* empty slot: +18 */ -/** +/** * APR is using anonymous shared memory * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ANONYMOUS(s) ((s) == APR_ANONYMOUS) -/** +/** * APR is using a file name as the key to the shared memory * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_FILEBASED(s) ((s) == APR_FILEBASED) -/** +/** * APR is using a shared key as the key to the shared memory * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_KEYBASED(s) ((s) == APR_KEYBASED) -/** - * Ininitalizer value. If no option has been found, but +/** + * Ininitalizer value. If no option has been found, but * the status variable requires a value, this should be used * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EINIT(s) ((s) == APR_EINIT) -/** - * The APR function has not been implemented on this - * platform, either because nobody has gotten to it yet, +/** + * The APR function has not been implemented on this + * platform, either because nobody has gotten to it yet, * or the function is impossible on this platform. * @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_ENOTIMPL(s) ((s) == APR_ENOTIMPL) -/** +/** * Two passwords do not match. 
* @warning * always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EMISMATCH(s) ((s) == APR_EMISMATCH) -/** +/** * The given lock was busy * @warning always use this test, as platform-specific variances may meet this - * more than one error code + * more than one error code */ #define APR_STATUS_IS_EBUSY(s) ((s) == APR_EBUSY) /** @} */ -/** +/** * @addtogroup APR_Error APR Error Values * @{ */ @@ -652,8 +652,8 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_ESPIPE (APR_OS_START_CANONERR + 12) #endif -/** - * @see APR_STATUS_IS_EAGAIN +/** + * @see APR_STATUS_IS_EAGAIN * @warning use APR_STATUS_IS_EAGAIN instead of just testing this value */ #ifdef EAGAIN @@ -692,7 +692,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_EINPROGRESS (APR_OS_START_CANONERR + 17) #endif -/** +/** * @see APR_STATUS_IS_ECONNABORTED * @warning use APR_STATUS_IS_ECONNABORTED instead of just testing this value */ @@ -710,7 +710,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define APR_ECONNRESET (APR_OS_START_CANONERR + 19) #endif -/** @see APR_STATUS_IS_ETIMEDOUT +/** @see APR_STATUS_IS_ETIMEDOUT * @deprecated */ #ifdef ETIMEDOUT #define APR_ETIMEDOUT ETIMEDOUT @@ -788,7 +788,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, */ #define APR_OS2_STATUS(e) (APR_FROM_OS_ERROR(e)) -/* These can't sit in a private header, so in spite of the extra size, +/* These can't sit in a private header, so in spite of the extra size, * they need to be made available here. 
*/ #define SOCBASEERR 10000 @@ -885,10 +885,10 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, || (s) == APR_OS_START_SYSERR + SOCECONNRESET) /* XXX deprecated */ #define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ - || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) #undef APR_STATUS_IS_TIMEUP #define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ - || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) #define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ || (s) == APR_OS_START_SYSERR + SOCEHOSTUNREACH) #define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ @@ -1121,7 +1121,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, #define apr_get_netos_error() (errno) #define apr_set_netos_error(e) (errno = (e)) -/** +/** * @addtogroup APR_STATUS_IS * @{ */ @@ -1185,15 +1185,15 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, /** operation now in progress */ #define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS) -/** - * Software caused connection abort +/** + * Software caused connection abort * @remark - * EPROTO on certain older kernels really means ECONNABORTED, so we need to + * EPROTO on certain older kernels really means ECONNABORTED, so we need to * ignore it for them. See discussion in new-httpd archives nh.9701 & nh.9603 * - * There is potentially a bug in Solaris 2.x x<6, and other boxes that + * There is potentially a bug in Solaris 2.x x<6, and other boxes that * implement tcp sockets in userland (i.e. on top of STREAMS). On these - * systems, EPROTO can actually result in a fatal loop. See PR#981 for + * systems, EPROTO can actually result in a fatal loop. See PR#981 for * example. It's hard to handle both uses of EPROTO. */ #ifdef EPROTO @@ -1236,14 +1236,14 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * Copyright (c) 2000 Apple Computer, Inc. 
All rights reserved. * * @APPLE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this * file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -1251,7 +1251,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_LICENSE_HEADER_END@ */ #include <sys/errno.h> @@ -1261,7 +1261,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -1270,10 +1270,10 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -1281,7 +1281,7 @@ APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ @@ -1484,7 +1484,7 @@ __END_DECLS #define ECANCELED 89 /* Operation canceled */ #define EIDRM 90 /* Identifier removed */ -#define ENOMSG 91 /* No message of desired type */ +#define ENOMSG 91 /* No message of desired type */ #define EILSEQ 92 /* Illegal byte sequence */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define ENOATTR 93 /* Attribute not found */ diff --git a/doc/errno.list.solaris.txt b/doc/legacy/errno.list.solaris.txt index 23601e9d3..23601e9d3 100644 --- a/doc/errno.list.solaris.txt +++ b/doc/legacy/errno.list.solaris.txt diff --git a/doc/user-guide/fdl.texi b/doc/legacy/fdl.texi index 92cfa81a7..e33c687cd 100644 --- a/doc/user-guide/fdl.texi +++ b/doc/legacy/fdl.texi @@ -27,7 +27,7 @@ for modifications made by others. This License is a kind of ``copyleft'', which means that derivative works of the document must themselves be free in the same sense. It -complements the GNU Affero General Public License, which is a copyleft +complements the GNU General Public License, which is a copyleft license designed for free software. We have designed this License in order to use it for manuals for free @@ -445,7 +445,7 @@ situation. 
If your document contains nontrivial examples of program code, we recommend releasing these examples in parallel under your choice of -free software license, such as the GNU Affero General Public License, +free software license, such as the GNU General Public License, to permit their use in free software. @c Local Variables: diff --git a/doc/user-guide/fuse.odg b/doc/legacy/fuse.odg Binary files differindex 61bd103c7..61bd103c7 100644 --- a/doc/user-guide/fuse.odg +++ b/doc/legacy/fuse.odg diff --git a/doc/user-guide/fuse.pdf b/doc/legacy/fuse.pdf Binary files differindex a7d13faff..a7d13faff 100644 --- a/doc/user-guide/fuse.pdf +++ b/doc/legacy/fuse.pdf diff --git a/doc/get_put_api_using_xattr.txt b/doc/legacy/get_put_api_using_xattr.txt index 58951f5bf..243f9f1ae 100644 --- a/doc/get_put_api_using_xattr.txt +++ b/doc/legacy/get_put_api_using_xattr.txt @@ -16,7 +16,7 @@ internals: * posix handling setxattr/getxattr - setxattr posix setxattr does a open with O_CREAT|O_TRUNC on the <path>/<name>, writes value of the setxattr as data into the file and closes the file. when data is null, posix setxattr avoids doing write. file is closed after write. - + - getxattr posix getxattr does open with O_RDONLY on the <path>/<name>, reads the complete content of the file. file is closed after read. 
diff --git a/doc/user-guide/ha.odg b/doc/legacy/ha.odg Binary files differindex e4b8b72d0..e4b8b72d0 100644 --- a/doc/user-guide/ha.odg +++ b/doc/legacy/ha.odg diff --git a/doc/user-guide/ha.pdf b/doc/legacy/ha.pdf Binary files differindex e372c0ab0..e372c0ab0 100644 --- a/doc/user-guide/ha.pdf +++ b/doc/legacy/ha.pdf diff --git a/doc/hacker-guide/Makefile.am b/doc/legacy/hacker-guide/Makefile.am index 65c92ac23..65c92ac23 100644 --- a/doc/hacker-guide/Makefile.am +++ b/doc/legacy/hacker-guide/Makefile.am diff --git a/doc/hacker-guide/call-stub.txt b/doc/legacy/hacker-guide/call-stub.txt index bca1579b2..021037a35 100644 --- a/doc/hacker-guide/call-stub.txt +++ b/doc/legacy/hacker-guide/call-stub.txt @@ -2,8 +2,8 @@ creating a call stub and pausing a call --------------------------------------- libglusterfs provides seperate API to pause each of the fop. parameters to each API is @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). - NOTE: @fn should exactly take the same type and number of parameters that +@fn - procedure to call during call_resume(). + NOTE: @fn should exactly take the same type and number of parameters that the corresponding regular fop takes. rest will be the regular parameters to corresponding fop. @@ -17,7 +17,7 @@ specific parameters. here is the list of stub creation APIs for xlator fops. @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -30,7 +30,7 @@ fop_lookup_stub (call_frame_t *frame, int32_t need_xattr); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). 
+@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -41,7 +41,7 @@ fop_stat_stub (call_frame_t *frame, loc_t *loc); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). call_stub_t * @@ -50,7 +50,7 @@ fop_fstat_stub (call_frame_t *frame, fd_t *fd); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be copied to a different location. @@ -62,7 +62,7 @@ fop_chmod_stub (call_frame_t *frame, mode_t mode); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). @mode - mode parameter for fchmod fop. @@ -73,7 +73,7 @@ fop_fchmod_stub (call_frame_t *frame, mode_t mode); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be copied to a different location. @@ -87,7 +87,7 @@ fop_chown_stub (call_frame_t *frame, gid_t gid); @frame - call frame which has to be used to resume the call at call_resume(). 
-@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). @uid - uid parameter to fchown. @@ -100,7 +100,7 @@ fop_fchown_stub (call_frame_t *frame, gid_t gid); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -113,7 +113,7 @@ fop_truncate_stub (call_frame_t *frame, off_t off); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). @off - offset parameter to ftruncate fop. @@ -124,7 +124,7 @@ fop_ftruncate_stub (call_frame_t *frame, off_t off); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -137,7 +137,7 @@ fop_utimens_stub (call_frame_t *frame, struct timespec tv[2]); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -150,7 +150,7 @@ fop_access_stub (call_frame_t *frame, int32_t mask); @frame - call frame which has to be used to resume the call at call_resume(). 
-@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -163,7 +163,7 @@ fop_readlink_stub (call_frame_t *frame, size_t size); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -178,7 +178,7 @@ fop_mknod_stub (call_frame_t *frame, dev_t rdev); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -191,7 +191,7 @@ fop_mkdir_stub (call_frame_t *frame, mode_t mode); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -202,7 +202,7 @@ fop_unlink_stub (call_frame_t *frame, loc_t *loc); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. 
also @loc->path will be @@ -213,7 +213,7 @@ fop_rmdir_stub (call_frame_t *frame, loc_t *loc); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @linkname - linkname parameter to symlink fop. @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @@ -226,10 +226,10 @@ fop_symlink_stub (call_frame_t *frame, loc_t *loc); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @oldloc - pointer to location structure. - NOTE: @oldloc will be copied to a different location, with inode_ref() to - @oldloc->inode and @oldloc->parent, if not NULL. also @oldloc->path will + NOTE: @oldloc will be copied to a different location, with inode_ref() to + @oldloc->inode and @oldloc->parent, if not NULL. also @oldloc->path will be copied to a different location, if not NULL. @newloc - pointer to location structure. NOTE: @newloc will be copied to a different location, with inode_ref() to @@ -242,7 +242,7 @@ fop_rename_stub (call_frame_t *frame, loc_t *newloc); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -255,7 +255,7 @@ fop_link_stub (call_frame_t *frame, const char *newpath); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. 
also @loc->path will be @@ -272,7 +272,7 @@ fop_create_stub (call_frame_t *frame, mode_t mode, fd_t *fd); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @flags - flags parameter to open fop. @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @@ -286,7 +286,7 @@ fop_open_stub (call_frame_t *frame, fd_t *fd); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). @size - size parameter to readv fop. @@ -299,10 +299,10 @@ fop_readv_stub (call_frame_t *frame, off_t off); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). -@vector - vector parameter to writev fop. +@vector - vector parameter to writev fop. NOTE: @vector is iov_dup()ed while creating stub. and frame->root->req_refs dictionary is dict_ref()ed. @count - count parameter to writev fop. @@ -316,7 +316,7 @@ fop_writev_stub (call_frame_t *frame, off_t off); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to flush fop. NOTE: @fd is stored with a fd_ref(). call_stub_t * @@ -326,7 +326,7 @@ fop_flush_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). 
@datasync - datasync parameter to fsync fop. @@ -337,7 +337,7 @@ fop_fsync_stub (call_frame_t *frame, int32_t datasync); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be copied to a different location. @@ -346,11 +346,11 @@ fop_fsync_stub (call_frame_t *frame, call_stub_t * fop_opendir_stub (call_frame_t *frame, fop_opendir_t fn, - loc_t *loc, + loc_t *loc, fd_t *fd); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to getdents fop. NOTE: @fd is stored with a fd_ref(). @size - size parameter to getdents fop. @@ -365,7 +365,7 @@ fop_getdents_stub (call_frame_t *frame, int32_t flag); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to setdents fop. NOTE: @fd is stored with a fd_ref(). @flags - flags parameter to setdents fop. @@ -379,7 +379,7 @@ fop_setdents_stub (call_frame_t *frame, int32_t count); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to setdents fop. NOTE: @fd is stored with a fd_ref(). @datasync - datasync parameter to fsyncdir fop. @@ -390,7 +390,7 @@ fop_fsyncdir_stub (call_frame_t *frame, int32_t datasync); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). 
@loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -401,9 +401,9 @@ fop_statfs_stub (call_frame_t *frame, loc_t *loc); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. - NOTE: @loc will be copied to a different location, with inode_ref() to + NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be copied to a different location. @dict - dict parameter to setxattr fop. @@ -416,7 +416,7 @@ fop_setxattr_stub (call_frame_t *frame, int32_t flags); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -429,7 +429,7 @@ fop_getxattr_stub (call_frame_t *frame, const char *name); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -443,7 +443,7 @@ fop_removexattr_stub (call_frame_t *frame, const char *name); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to lk fop. NOTE: @fd is stored with a fd_ref(). @cmd - command parameter to lk fop. 
@@ -457,13 +457,13 @@ fop_lk_stub (call_frame_t *frame, struct flock *lock); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - fd parameter to gf_lk fop. NOTE: @fd is fd_ref()ed while creating stub, if not NULL. @cmd - cmd parameter to gf_lk fop. @lock - lock paramater to gf_lk fop. NOTE: @lock is copied to a different memory location while creating - stub. + stub. call_stub_t * fop_gf_lk_stub (call_frame_t *frame, fop_gf_lk_t fn, @@ -472,7 +472,7 @@ fop_gf_lk_stub (call_frame_t *frame, struct flock *lock); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @fd - file descriptor parameter to readdir fop. NOTE: @fd is stored with a fd_ref(). @size - size parameter to readdir fop. @@ -485,7 +485,7 @@ fop_readdir_stub (call_frame_t *frame, off_t off); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @loc - pointer to location structure. NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and @loc->parent, if not NULL. also @loc->path will be @@ -498,7 +498,7 @@ fop_checksum_stub (call_frame_t *frame, int32_t flags); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @inode - inode parameter to @fn. @@ -516,7 +516,7 @@ fop_lookup_cbk_stub (call_frame_t *frame, struct stat *buf, dict_t *dict); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). 
@op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -529,7 +529,7 @@ fop_stat_cbk_stub (call_frame_t *frame, struct stat *buf); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -542,7 +542,7 @@ fop_fstat_cbk_stub (call_frame_t *frame, struct stat *buf); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -555,7 +555,7 @@ fop_chmod_cbk_stub (call_frame_t *frame, struct stat *buf); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -568,7 +568,7 @@ fop_fchmod_cbk_stub (call_frame_t *frame, struct stat *buf); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -582,7 +582,7 @@ fop_chown_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -596,7 +596,7 @@ fop_fchown_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). 
-@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -610,7 +610,7 @@ fop_truncate_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -624,7 +624,7 @@ fop_ftruncate_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -638,7 +638,7 @@ fop_utimens_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -649,7 +649,7 @@ fop_access_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @path - path parameter to @fn. @@ -663,7 +663,7 @@ fop_readlink_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @inode - inode parameter to @fn. 
@@ -680,7 +680,7 @@ fop_mknod_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @inode - inode parameter to @fn. @@ -697,7 +697,7 @@ fop_mkdir_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -708,7 +708,7 @@ fop_unlink_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -719,7 +719,7 @@ fop_rmdir_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @inode - inode parameter to @fn. @@ -736,7 +736,7 @@ fop_symlink_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -750,7 +750,7 @@ fop_rename_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. 
@inode - inode parameter to @fn. @@ -766,7 +766,7 @@ fop_link_cbk_stub (call_frame_t *frame, struct stat *buf); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @fd - fd parameter to @fn. @@ -786,7 +786,7 @@ fop_create_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @fd - fd parameter to @fn. @@ -800,10 +800,10 @@ fop_open_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. -@vector - vector parameter to @fn. +@vector - vector parameter to @fn. NOTE: @vector is copied to a different memory location, if not NULL. also frame->root->rsp_refs is dict_ref()ed. @stbuf - stbuf parameter to @fn. @@ -819,7 +819,7 @@ fop_readv_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @stbuf - stbuf parameter to @fn. @@ -833,7 +833,7 @@ fop_writev_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. 
call_stub_t * @@ -843,7 +843,7 @@ fop_flush_cbk_stub (call_frame_t *frame, int32_t op_errno); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -854,7 +854,7 @@ fop_fsync_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @fd - fd parameter to @fn. @@ -868,7 +868,7 @@ fop_opendir_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @entries - entries parameter to @fn. @@ -883,7 +883,7 @@ fop_getdents_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -893,7 +893,7 @@ fop_setdents_cbk_stub (call_frame_t *frame, int32_t op_errno); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -904,7 +904,7 @@ fop_fsyncdir_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. 
@op_errno - op_errno parameter to @fn. @buf - buf parameter to @fn. @@ -918,7 +918,7 @@ fop_statfs_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -929,7 +929,7 @@ fop_setxattr_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @value - value dictionary parameter to @fn. @@ -943,7 +943,7 @@ fop_getxattr_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. call_stub_t * @@ -954,12 +954,12 @@ fop_removexattr_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @lock - lock parameter to @fn. NOTE: @lock is copied to a different memory location while creating - stub. + stub. call_stub_t * fop_lk_cbk_stub (call_frame_t *frame, fop_lk_cbk_t fn, @@ -968,12 +968,12 @@ fop_lk_cbk_stub (call_frame_t *frame, struct flock *lock); @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @lock - lock parameter to @fn. 
NOTE: @lock is copied to a different memory location while creating - stub. + stub. call_stub_t * fop_gf_lk_cbk_stub (call_frame_t *frame, fop_gf_lk_cbk_t fn, @@ -983,7 +983,7 @@ fop_gf_lk_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @entries - entries parameter to @fn. @@ -996,14 +996,14 @@ fop_readdir_cbk_stub (call_frame_t *frame, @frame - call frame which has to be used to resume the call at call_resume(). -@fn - procedure to call during call_resume(). +@fn - procedure to call during call_resume(). @op_ret - op_ret parameter to @fn. @op_errno - op_errno parameter to @fn. @file_checksum - file_checksum parameter to @fn. - NOTE: file_checksum will be copied to a different memory location + NOTE: file_checksum will be copied to a different memory location while creating stub. @dir_checksum - dir_checksum parameter to @fn. - NOTE: file_checksum will be copied to a different memory location + NOTE: file_checksum will be copied to a different memory location while creating stub. call_stub_t * fop_checksum_cbk_stub (call_frame_t *frame, @@ -1025,9 +1025,9 @@ resuming a call: in stub->args.<operation>.<fd_t-or-inode_t-or-dict_t>. so, if any fd_t, dict_t or inode_t pointers are assigned at stub->args.<operation>.<fd_t-or-inode_t-or-dict_t> after fop_<operation>_stub() call, they must be <fd_t-or-inode_t-or-dict_t>_ref()ed. - + call_resume does not STACK_DESTROY() for any fop. - + if stub->fn is NULL, call_resume does STACK_WIND() or STACK_UNWIND() using the stub->frame. return - call resume fails only if stub is NULL. call resume fails with errno set to EINVAL. 
diff --git a/doc/hacker-guide/hacker-guide.tex b/doc/legacy/hacker-guide/hacker-guide.tex index c2d7255d7..11101e7a8 100644 --- a/doc/hacker-guide/hacker-guide.tex +++ b/doc/legacy/hacker-guide/hacker-guide.tex @@ -31,7 +31,7 @@ most part. \chapter{Major components} \section{libglusterfs} -\texttt{libglusterfs} contains supporting code used by all the other components. +\texttt{libglusterfs} contains supporting code used by all the other components. The important files here are: \texttt{dict.c}: This is an implementation of a serializable dictionary type. It is @@ -165,8 +165,8 @@ First we include the requisite headers. #include "logging.h" /* - * This is a rot13 ``encryption'' xlator. It rot13's data when - * writing to disk and rot13's it back when reading it. + * This is a rot13 ``encryption'' xlator. It rot13's data when + * writing to disk and rot13's it back when reading it. * This xlator is meant as an example, not for production * use ;) (hence no error-checking) */ @@ -178,7 +178,7 @@ letters. Any other byte is passed through as it is. \begin{verbatim} /* We only handle lower case letters for simplicity */ -static void +static void rot13 (char *buf, int len) { int i; @@ -252,12 +252,12 @@ rot13_writev (call_frame_t *frame, xlator_t *this, dict_t *ctx, struct iovec *vector, - int32_t count, + int32_t count, off_t offset) { rot13_iovec (vector, count); - STACK_WIND (frame, + STACK_WIND (frame, rot13_writev_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, @@ -267,9 +267,9 @@ rot13_writev (call_frame_t *frame, \end{verbatim} -Every xlator must define two functions and two external symbols. The functions are +Every xlator must define two functions and two external symbols. The functions are \texttt{init} and \texttt{fini}, and the symbols are \texttt{fops} and \texttt{mops}. 
-The \texttt{init} function is called when the xlator is loaded by GlusterFS, and +The \texttt{init} function is called when the xlator is loaded by GlusterFS, and contains code for the xlator to initialize itself. Note that if an xlator is present multiple times in the spec tree, the \texttt{init} function will be called each time the xlator is loaded. @@ -279,7 +279,7 @@ int32_t init (xlator_t *this) { if (!this->children) { - gf_log ("rot13", GF_LOG_ERROR, + gf_log ("rot13", GF_LOG_ERROR, "FATAL: rot13 should have exactly one child"); return -1; } @@ -291,7 +291,7 @@ init (xlator_t *this) \begin{verbatim} -void +void fini (xlator_t *this) { return; diff --git a/doc/hacker-guide/replicate.txt b/doc/legacy/hacker-guide/replicate.txt index fd1ef2747..133c72afa 100644 --- a/doc/hacker-guide/replicate.txt +++ b/doc/legacy/hacker-guide/replicate.txt @@ -5,7 +5,7 @@ Before understanding replicate, one must understand two internal FOPs: GF_FILE_LK: - This is exactly like fcntl(2) locking, except the locks are in a + This is exactly like fcntl(2) locking, except the locks are in a separate domain from locks held by applications. GF_DIR_LK (loc_t *loc, char *basename): @@ -17,7 +17,7 @@ GF_DIR_LK (loc_t *loc, char *basename): If one wishes to lock *all* the names under a particular directory, supply the basename argument as NULL. - The locks can either be read locks or write locks; consult the + The locks can either be read locks or write locks; consult the function prototype for more details. Both these operations are implemented by the features/locks (earlier @@ -79,7 +79,7 @@ Each of the four major groups has its own algorithm: All operations are done in parallel unless specified otherwise. 
- (1) Send a GF_FILE_LK request on all children for a write lock on + (1) Send a GF_FILE_LK request on all children for a write lock on the appropriate region (for metadata operations: entire file (0, 0) for writev: (offset, offset+size of buffer)) @@ -87,11 +87,11 @@ Each of the four major groups has its own algorithm: - If a lock request fails on a child: unlock all children try to acquire a blocking lock (F_SETLKW) on each child, serially. - + If this fails (due to ENOTCONN or EINVAL): Consider this child as dead for rest of transaction. - (2) Mark all children as "pending" on all (alive) children + (2) Mark all children as "pending" on all (alive) children (see below for meaning of "pending"). - If it fails on any child: @@ -105,7 +105,7 @@ Each of the four major groups has its own algorithm: (4) Unmark all successful children as not "pending" on all nodes. (5) Unlock region on all (alive) children. - + ----------- - dir-write ----------- @@ -121,11 +121,11 @@ Each of the four major groups has its own algorithm: The "pending" number is like a journal entry. A pending entry is an array of 32-bit integers stored in network byte-order as the extended attribute of an inode (which can be a directory as well). - + There are three keys corresponding to three types of pending operations: - AFR_METADATA_PENDING - There are some metadata operations pending on this inode (perms, ctime/mtime, + There are some metadata operations pending on this inode (perms, ctime/mtime, xattr, etc.). - AFR_DATA_PENDING @@ -134,7 +134,7 @@ Each of the four major groups has its own algorithm: - AFR_ENTRY_PENDING There are some directory operations pending on this directory (create, unlink, etc.). - + ----------- * Self heal ----------- @@ -155,7 +155,7 @@ Each of the four major groups has its own algorithm: other is directory): - Announce to the user via log that a split-brain situation has been detected, and do nothing. 
- + - On open, gather extended attribute data: - Consider the file with the highest AFR_DATA_PENDING number as the definitive one and replicate its contents on all other @@ -190,7 +190,7 @@ Thus, if lookup on c1 returns an inode number "2", it is scaled to "4" This way we ensure that there is never a collision of inode numbers from two different children. -This reduction of inode space doesn't really reduce the usability of +This reduction of inode space doesn't really reduce the usability of replicate since even if we assume replicate has 1024 children (which would be a highly unusual scenario), each child still has a 54-bit inode space. diff --git a/doc/handling-options.txt b/doc/legacy/handling-options.txt index cac1fe939..9a3b2510a 100644 --- a/doc/handling-options.txt +++ b/doc/legacy/handling-options.txt @@ -2,12 +2,12 @@ How to add a new option to a given volume ? =========================================== -* Add a entry in 'struct volume_options options[]' with your key, what is +* Add a entry in 'struct volume_options options[]' with your key, what is the type of the 'key', etc. -* The 'key' and corresponding 'value' given for the same by user are validated +* The 'key' and corresponding 'value' given for the same by user are validated before calling init() of the translator/transport/scheduler/auth-module. -* Once the complete init() is successful, user will get a warning if he has +* Once the complete init() is successful, user will get a warning if he has given a 'key' which is not defined in these modules. 
diff --git a/doc/mac-related-xattrs.txt b/doc/legacy/mac-related-xattrs.txt index 805658334..92bb2ceef 100644 --- a/doc/mac-related-xattrs.txt +++ b/doc/legacy/mac-related-xattrs.txt @@ -1,21 +1,21 @@ -This document is intended to briefly explain how the Extended Attributes on +This document is intended to briefly explain how the Extended Attributes on Darwin 10.5.x releases works ---- -On Darwin other than all the normal filesystem operations, 'Finder' (like -Explorer in Windows but a little more) keeps its information in two extended -attributes named 'com.apple.FinderInfo' and 'com.apple.ResourceFork'. If these -xattrs are not implemented the filesystem won't be shown on Finder, and if they -are not implemented properly there may be issues when some of the file operations -are done through GUI of Finder. But when a filesystem is used over mountpoint in a -terminal, everything is fine and these xattrs are not required. +On Darwin other than all the normal filesystem operations, 'Finder' (like +Explorer in Windows but a little more) keeps its information in two extended +attributes named 'com.apple.FinderInfo' and 'com.apple.ResourceFork'. If these +xattrs are not implemented the filesystem won't be shown on Finder, and if they +are not implemented properly there may be issues when some of the file operations +are done through GUI of Finder. But when a filesystem is used over mountpoint in a +terminal, everything is fine and these xattrs are not required. -Currently the way these xattrs are implemented is simple. All the xattr calls +Currently the way these xattrs are implemented is simple. All the xattr calls (getxattr, setxattr, listxattr, removexattr) are passed down to underlaying filesystem, most of the cases when exported FS is on MacOS X itself, these keys are supported, hence -the fops succeed. But in the case of using exports of different OS on Darwin the issue is -extended attribute prefix like 'com.apple.' 
may not be supported, hence the problem with +the fops succeed. But in the case of using exports of different OS on Darwin the issue is +extended attribute prefix like 'com.apple.' may not be supported, hence the problem with Finder. To solve this issue, GlusterFS returns virtual default values to these keys, which works fine on most of the cases. diff --git a/doc/porting_guide.txt b/doc/legacy/porting_guide.txt index 905bb4228..5705cd964 100644 --- a/doc/porting_guide.txt +++ b/doc/legacy/porting_guide.txt @@ -3,7 +3,7 @@ * General setup -The configure script will detect the target platform for the build. +The configure script will detect the target platform for the build. All platform-specific CFLAGS, macro definitions should be done in configure.ac @@ -15,11 +15,11 @@ Platform-specific code can be written like this: * Coding guidelines -In general, avoid glibc extensions. For example, nested functions don't work +In general, avoid glibc extensions. For example, nested functions don't work on Mac OS X. It is best to stick to C99. When using library calls and system calls, pay attention to the -portability notes. As far as possible stick to POSIX-specified behavior. +portability notes. As far as possible stick to POSIX-specified behavior. Do not use anything expressly permitted by the specification. For example, some fields in structures may be present only on certain platforms. Avoid use of such things. @@ -27,14 +27,14 @@ use of such things. Do not pass values of constants such as F_*, O_*, errno values, etc. across platforms. -Please refer compat-errno.h for more details about errno handling inside -glusterfs for cross platform. +Please refer compat-errno.h for more details about errno handling inside +glusterfs for cross platform. * Specific issues - The argp library is available only on Linux through glibc, but for other platforms glusterfs has already included argp-standalone library which will - statically linked during the glusterfs build. 
+ statically linked during the glusterfs build. - Extended attribute calls (setxattr, listxattr, etc.) have differing prototypes on different platforms. See compat.h for macro definitions to resolve this, also diff --git a/doc/replicate.lyx b/doc/legacy/replicate.lyx index d11a92bee..58ba6b2e0 100644 --- a/doc/replicate.lyx +++ b/doc/legacy/replicate.lyx @@ -36,7 +36,7 @@ Automatic File Replication (replicate) in GlusterFS \end_layout \begin_layout Author -Vikas Gorur +Vikas Gorur \family typewriter \size larger <vikas@gluster.com> @@ -77,7 +77,7 @@ The replicate translator of GlusterFS aims to keep identical copies of a file \end_layout \begin_layout Standard -In the rest of the document the terms +In the rest of the document the terms \begin_inset Quotes eld \end_inset @@ -85,7 +85,7 @@ subvolume \begin_inset Quotes erd \end_inset - and + and \begin_inset Quotes eld \end_inset @@ -167,7 +167,7 @@ end{verbatim} \begin_layout Standard This defines an replicate volume with two subvolumes, brick1, and brick2. - For replicate to work properly, it is essential that its subvolumes support + For replicate to work properly, it is essential that its subvolumes support \series bold extended attributes \series default @@ -177,7 +177,7 @@ extended attributes \end_layout \begin_layout Standard -The storage volumes used as backend for replicate +The storage volumes used as backend for replicate \emph on must \emph default @@ -262,7 +262,7 @@ replicate divides all filesystem write operations into three classes: \begin_layout Itemize \series bold -data: +data: \series default Operations that modify the contents of a file (write, truncate). \end_layout @@ -270,7 +270,7 @@ Operations that modify the contents of a file (write, truncate). \begin_layout Itemize \series bold -metadata: +metadata: \series default Operations that modify attributes of a file or directory (permissions, ownership , etc.). 
@@ -279,7 +279,7 @@ Operations that modify attributes of a file or directory (permissions, ownership \begin_layout Itemize \series bold -entry: +entry: \series default Operations that create or delete directory entries (mkdir, create, rename, rmdir, unlink, etc.). @@ -345,7 +345,7 @@ Self-Heal \begin_layout Standard replicate automatically tries to fix any inconsistencies it detects among different copies of a file. - It uses information in the change log to determine which copy is the + It uses information in the change log to determine which copy is the \begin_inset Quotes eld \end_inset @@ -357,7 +357,7 @@ correct \end_layout \begin_layout Standard -Self-heal is triggered when a file or directory is first +Self-heal is triggered when a file or directory is first \begin_inset Quotes eld \end_inset @@ -374,7 +374,7 @@ If the entry being accessed is a directory: \end_layout \begin_layout Itemize -The contents of the +The contents of the \begin_inset Quotes eld \end_inset @@ -412,7 +412,7 @@ It may happen that one replicate client can access only some of the servers in a cluster and another replicate client can access the remaining servers. Or it may happen that in a cluster of two servers, one server goes down and comes back up, but the other goes down immediately. - Both these scenarios result in a + Both these scenarios result in a \begin_inset Quotes eld \end_inset @@ -425,7 +425,7 @@ split-brain \begin_layout Standard In a split-brain situation, there will be two or more copies of a file, - all of which are + all of which are \begin_inset Quotes eld \end_inset @@ -484,12 +484,12 @@ split-brain ). This means if a discrepancy is noticed in the attributes or content of a file, the copy on the `favorite-child' will be considered the definitive - version and its contents will + version and its contents will \emph on -overwrite +overwrite \emph default the contents of all other copies. - Use this option with caution! It is possible to + Use this option with caution! 
It is possible to \emph on lose data \emph default @@ -502,7 +502,7 @@ Self-heal options \end_layout \begin_layout Standard -Setting any of these options to +Setting any of these options to \begin_inset Quotes eld \end_inset @@ -549,7 +549,7 @@ If any of these options is turned off, it disables writing of change log entries for that class of file operations. That is, steps 2 and 4 of the write algorithm (see above) are not done. Note that if the change log is not written, the self-heal algorithm cannot - determine the + determine the \begin_inset Quotes eld \end_inset @@ -557,7 +557,7 @@ correct \begin_inset Quotes erd \end_inset - version of a file and hence self-heal will only be able to fix + version of a file and hence self-heal will only be able to fix \begin_inset Quotes eld \end_inset @@ -602,7 +602,7 @@ These options let you specify the number of lock servers to use for each The default values are satisfactory in most cases. If you are extra paranoid, you may want to increase the values. However, be very cautious if you set the data- or entry- lock server counts - to zero, since this can result in + to zero, since this can result in \emph on lost data. @@ -610,11 +610,11 @@ lost data. For example, if you set the data-lock-server-count to zero, and two application s write to the same region of a file, there is a possibility that none of your servers will have all the data. 
- In other words, the copies will be + In other words, the copies will be \emph on inconsistent \emph default -, and +, and \emph on incomplete \emph default diff --git a/doc/replicate.pdf b/doc/legacy/replicate.pdf Binary files differindex b7212af2b..b7212af2b 100644 --- a/doc/replicate.pdf +++ b/doc/legacy/replicate.pdf diff --git a/doc/solaris-related-xattrs.txt b/doc/legacy/solaris-related-xattrs.txt index e26efa5d1..3a4643948 100644 --- a/doc/solaris-related-xattrs.txt +++ b/doc/legacy/solaris-related-xattrs.txt @@ -1,42 +1,42 @@ Solaris Extended Attributes In solaris extended attributes are logically supported as files -within the filesystem. The file system is therefore augmented +within the filesystem. The file system is therefore augmented with an orthogonal namespace of file attributes. Attribute values are accessed by file descriptors obtained through a special attribute -interface. This type of logical view of "attributes as files" allows -the leveraging of existing file system interface functionality to -support the construction, deletion and manipulation of attributes. +interface. This type of logical view of "attributes as files" allows +the leveraging of existing file system interface functionality to +support the construction, deletion and manipulation of attributes. But as we have tested through this functionality provided by Solaris we have come accross two major issues as written below. -1. Symlink XATTR_NOFOLLOW not present for creating extended attributes +1. Symlink XATTR_NOFOLLOW not present for creating extended attributes directly on the symlinks like other platforms Linux,MAC-OSX,BSD etc. 
- An implementation is present for O_NOFOLLOW for "openat()" call sets - up errno ELOOP whenever encountered with a symlink and also another + An implementation is present for O_NOFOLLOW for "openat()" call sets + up errno ELOOP whenever encountered with a symlink and also another implementation AT_SYMLINK_NOFOLLOW which is not present for calls like "attropen(), openat()" a snippet of test code which helped us understand this behaviour -------------------------------------- - attrfd = attropen (path, key, + attrfd = attropen (path, key, flags|AT_SYMLINK_NOFOLLOW|O_CREAT|O_WRONLY|O_NOFOLLOW, 0777); if (attrfd >= 0) { ftruncate (attrfd, 0); ret = write (attrfd, value, size); close (attrfd); } else { - fprintf (stderr, "Couldn't set extended attribute for %s (%d)\n", + fprintf (stderr, "Couldn't set extended attribute for %s (%d)\n", path, errno); - } + } -------------------------------------- 2. Extended attribute support for special files like device files, fifo files - is not supported under solaris. + is not supported under solaris. Apart from these glitches almost everything regarding porting functionality -for extended attribute calls has been properly implemented in compat.c +for extended attribute calls has been properly implemented in compat.c with writing wrapper around functions over "attropen()", "openat()", "unlinkat()" diff --git a/doc/stat-prefetch-design.txt b/doc/legacy/stat-prefetch-design.txt index 06d0ad37e..68ed423d3 100644 --- a/doc/stat-prefetch-design.txt +++ b/doc/legacy/stat-prefetch-design.txt @@ -1,64 +1,64 @@ what is stat-prefetch? ====================== It is a translator which caches the dentries read in readdir. This dentry -list is stored in the context of fd. Later when lookup happens on +list is stored in the context of fd. Later when lookup happens on [parent-inode, basename (path)] combination, this list is searched for the basename. 
The dentry thus searched is used to fill up the stat corresponding to path being looked upon, thereby short-cutting lookup calls. This cache is -preserved till closedir is called on the fd. The purpose of this translator -is to optimize operations like 'ls -l', where a readdir is followed by +preserved till closedir is called on the fd. The purpose of this translator +is to optimize operations like 'ls -l', where a readdir is followed by lookup (stat) calls on each directory entry. -1. stat-prefetch harnesses the efficiency of short lookup calls - (saves network roundtrip time for lookup calls from being accounted to +1. stat-prefetch harnesses the efficiency of short lookup calls + (saves network roundtrip time for lookup calls from being accounted to the stat call). -2. To maintain the correctness, it does lookup-behind - lookup is winded to - underlying translators after it is unwound to upper translators. +2. To maintain the correctness, it does lookup-behind - lookup is winded to + underlying translators after it is unwound to upper translators. lookup-behind is necessary as inode gets populated in server inode table - only in lookup-cbk and also because various translators store their + only in lookup-cbk and also because various translators store their contexts in inode contexts during lookup calls. fops to be implemented: ======================= * lookup - 1. check the dentry cache stored in context of fds opened by the same process + 1. check the dentry cache stored in context of fds opened by the same process on parent inode for basename. If found unwind with cached stat, else wind - the lookup call to underlying translators. - 2. stat is stored in the context of inode if the path being looked upon + the lookup call to underlying translators. + 2. stat is stored in the context of inode if the path being looked upon happens to be directory. This stat will be used to fill postparent stat when lookup happens on any of the directory contents. * readdir 1. 
cache the direntries returned in readdir_cbk in the context of fd. - 2. if the readdir is happening on non-expected offsets (means a seekdir/rewinddir + 2. if the readdir is happening on non-expected offsets (means a seekdir/rewinddir has happened), cache has to be flushed. - 3. delete the entry corresponding to basename of path on which fd is opened + 3. delete the entry corresponding to basename of path on which fd is opened from cache stored in parent. * chmod/fchmod delete the entry corresponding to basename from cache stored in context of - fds opened on parent inode, since these calls change st_mode and st_ctime of + fds opened on parent inode, since these calls change st_mode and st_ctime of stat. - + * chown/fchown - delete the entry corresponding to basename from cache stored in context of - fds opened on parent inode, since these calls change st_uid/st_gid and + delete the entry corresponding to basename from cache stored in context of + fds opened on parent inode, since these calls change st_uid/st_gid and st_ctime of stat. * truncate/ftruncate - delete the entry corresponding to basename from cache stored in context of + delete the entry corresponding to basename from cache stored in context of fds opened on parent inode, since these calls change st_size/st_mtime of stat. * utimens - delete the entry corresponding to basename from cache stored in context of + delete the entry corresponding to basename from cache stored in context of fds opened on parent inode, since this call changes st_atime/st_mtime of stat. * readlink delete the entry corresponding to basename from cache stored in context of fds opened on parent inode, since this call changes st_atime of stat. - + * unlink - 1. delete the entry corresponding to basename from cache stored in context of + 1. delete the entry corresponding to basename from cache stored in context of fds, opened on parent directory containing the file being unlinked. 2. 
delete the entry corresponding to basename of parent directory from cache of grand-parent. @@ -66,14 +66,14 @@ fops to be implemented: * rmdir 1. delete the entry corresponding to basename from cache stored in context of fds opened on parent inode. - 2. remove the entire cache from all fds opened on inode corresponding to + 2. remove the entire cache from all fds opened on inode corresponding to directory being removed. 3. delete the entry correspondig to basename of parent from cache stored in grand-parent. * readv delete the entry corresponding to basename from cache stored in context of fds - opened on parent inode, since readv changes st_atime of file. + opened on parent inode, since readv changes st_atime of file. * writev delete the entry corresponding to basename from cache stored in context of fds @@ -82,29 +82,29 @@ fops to be implemented: * fsync there is a confusion here as to whether fsync updates mtime/ctimes. Disk based - filesystems (atleast ext2) just writes the times stored in inode to disk - during fsync and not the time at which fsync is being done. But in glusterfs, - a translator like write-behind actually sends writes during fsync which will - change mtime/ctime. Hence stat-prefetch implements fsync to delete the entry + filesystems (atleast ext2) just writes the times stored in inode to disk + during fsync and not the time at which fsync is being done. But in glusterfs, + a translator like write-behind actually sends writes during fsync which will + change mtime/ctime. Hence stat-prefetch implements fsync to delete the entry corresponding to basename from cache stored in context of fds opened on parent inode. - + * rename - 1. remove entry corresponding to oldname from cache stored in fd contexts of + 1. remove entry corresponding to oldname from cache stored in fd contexts of oldparent. 2. remove entry corresponding to newname from cache stored in fd contexts of - newparent. - 3. 
remove entry corresponding to oldparent from cache stored in + newparent. + 3. remove entry corresponding to oldparent from cache stored in old-grand-parent, since removing oldname changes st_mtime and st_ctime of oldparent stat. - 4. remove entry corresponding to newparent from cache stored in + 4. remove entry corresponding to newparent from cache stored in new-grand-parent, since adding newname changes st_mtime and st_ctime of newparent stat. - 5. if oldname happens to be a directory, remove entire cache from all fds + 5. if oldname happens to be a directory, remove entire cache from all fds opened on it. * create/mknod/mkdir/symlink/link - delete entry corresponding to basename of parent directory in which these + delete entry corresponding to basename of parent directory in which these operations are happening, from cache stored in context of fds opened on grand-parent, since adding a new entry to a directory changes st_mtime and st_ctime of parent directory. @@ -116,13 +116,13 @@ fops to be implemented: * setdents 1. remove entry corresponding to basename of path on which fd is opened from cache stored in context of fds opened on parent. - 2. for each of the entry in the direntry list, delete from cache stored in + 2. for each of the entry in the direntry list, delete from cache stored in context of fd, the entry corresponding to basename of path being passed. * getdents 1. remove entry corresponding to basename of path on which fd is opened from - cache stored in parent, since getdents changes st_atime. - 2. remove entries corresponding to symbolic links from cache, since readlink + cache stored in parent, since getdents changes st_atime. + 2. remove entries corresponding to symbolic links from cache, since readlink would've changed st_atime. 
* checksum @@ -144,11 +144,11 @@ callbacks to be implemented: limitations: ============ * since a readdir does not return extended attributes of file, if need_xattr is - set, short-cutting of lookup does not happen and lookup is passed to + set, short-cutting of lookup does not happen and lookup is passed to underlying translators. * posix_readdir does not check whether the dentries are spanning across multiple - mount points. Hence it is not transforming inode numbers in stat buffers if + mount points. Hence it is not transforming inode numbers in stat buffers if posix is configured to allow export directory spanning on multiple mountpoints. - This is a bug which needs to be fixed. posix_readdir should treat dentries the + This is a bug which needs to be fixed. posix_readdir should treat dentries the same way as if lookup is happening on dentries. diff --git a/doc/user-guide/stripe.odg b/doc/legacy/stripe.odg Binary files differindex 79441bf14..79441bf14 100644 --- a/doc/user-guide/stripe.odg +++ b/doc/legacy/stripe.odg diff --git a/doc/user-guide/stripe.pdf b/doc/legacy/stripe.pdf Binary files differindex b94446feb..b94446feb 100644 --- a/doc/user-guide/stripe.pdf +++ b/doc/legacy/stripe.pdf diff --git a/doc/translator-options.txt b/doc/legacy/translator-options.txt index 278ef5b00..3422c058a 100644 --- a/doc/translator-options.txt +++ b/doc/legacy/translator-options.txt @@ -1,7 +1,7 @@ mount/fuse: * direct-io-mode GF_OPTION_TYPE_BOOL on|off|yes|no * mount-point (mountpoint) GF_OPTION_TYPE_PATH <any-posix-valid-path> - * attribute-timeout GF_OPTION_TYPE_DOUBLE 0.0 + * attribute-timeout GF_OPTION_TYPE_DOUBLE 0.0 * entry-timeout GF_OPTION_TYPE_DOUBLE 0.0 protocol/server: @@ -13,20 +13,20 @@ protocol/server: protocol/client: * username GF_OPTION_TYPE_ANY - * password GF_OPTION_TYPE_ANY + * password GF_OPTION_TYPE_ANY * transport-type GF_OPTION_TYPE_STR tcp|socket|ib-verbs|unix|ib-sdp| tcp/client|ib-verbs/client - * remote-host GF_OPTION_TYPE_ANY - * remote-subvolume 
GF_OPTION_TYPE_ANY - * transport-timeout GF_OPTION_TYPE_TIME 5-1013 + * remote-host GF_OPTION_TYPE_ANY + * remote-subvolume GF_OPTION_TYPE_ANY + * transport-timeout GF_OPTION_TYPE_TIME 5-1013 cluster/replicate: * read-subvolume GF_OPTION_TYPE_XLATOR * favorite-child GF_OPTION_TYPE_XLATOR - * data-self-heal GF_OPTION_TYPE_BOOL + * data-self-heal GF_OPTION_TYPE_BOOL * metadata-self-heal GF_OPTION_TYPE_BOOL - * entry-self-heal GF_OPTION_TYPE_BOOL - * data-change-log GF_OPTION_TYPE_BOOL + * entry-self-heal GF_OPTION_TYPE_BOOL + * data-change-log GF_OPTION_TYPE_BOOL * metadata-change-log GF_OPTION_TYPE_BOOL * entry-change-log GF_OPTION_TYPE_BOOL * data-lock-server-count GF_OPTION_TYPE_INT 0 @@ -34,54 +34,54 @@ cluster/replicate: * entry-lock-server-count GF_OPTION_TYPE_INT 0 cluster/distribute: - * lookup-unhashed GF_OPTION_TYPE_BOOL + * lookup-unhashed GF_OPTION_TYPE_BOOL cluster/unify: - * namespace GF_OPTION_TYPE_XLATOR - * scheduler GF_OPTION_TYPE_STR alu|rr|random|nufa|switch + * namespace GF_OPTION_TYPE_XLATOR + * scheduler GF_OPTION_TYPE_STR alu|rr|random|nufa|switch * self-heal GF_OPTION_TYPE_STR foreground|background|off - * optimist GF_OPTION_TYPE_BOOL + * optimist GF_OPTION_TYPE_BOOL cluster/nufa: - local-volume-name GF_OPTION_TYPE_XLATOR + local-volume-name GF_OPTION_TYPE_XLATOR cluster/stripe: - * block-size GF_OPTION_TYPE_ANY + * block-size GF_OPTION_TYPE_ANY * use-xattr GF_OPTION_TYPE_BOOL debug/trace: * include-ops (include) GF_OPTION_TYPE_STR - * exclude-ops (exclude) GF_OPTION_TYPE_STR + * exclude-ops (exclude) GF_OPTION_TYPE_STR encryption/rot-13: * encrypt-write GF_OPTION_TYPE_BOOL - * decrypt-read GF_OPTION_TYPE_BOOL + * decrypt-read GF_OPTION_TYPE_BOOL features/path-convertor: - * start-offset GF_OPTION_TYPE_INT 0-4095 - * end-offset GF_OPTION_TYPE_INT 1-4096 + * start-offset GF_OPTION_TYPE_INT 0-4095 + * end-offset GF_OPTION_TYPE_INT 1-4096 * replace-with GF_OPTION_TYPE_ANY features/trash: - * trash-dir GF_OPTION_TYPE_PATH + * trash-dir 
GF_OPTION_TYPE_PATH features/locks: - * mandatory-locks (mandatory) GF_OPTION_TYPE_BOOL + * mandatory-locks (mandatory) GF_OPTION_TYPE_BOOL features/filter: - * root-squashing GF_OPTION_TYPE_BOOL + * root-squashing GF_OPTION_TYPE_BOOL * read-only GF_OPTION_TYPE_BOOL * fixed-uid GF_OPTION_TYPE_INT * fixed-gid GF_OPTION_TYPE_INT - * translate-uid GF_OPTION_TYPE_ANY + * translate-uid GF_OPTION_TYPE_ANY * translate-gid GF_OPTION_TYPE_ANY - * filter-uid GF_OPTION_TYPE_ANY - * filter-gid GF_OPTION_TYPE_ANY + * filter-uid GF_OPTION_TYPE_ANY + * filter-gid GF_OPTION_TYPE_ANY features/quota: * min-free-disk-limit GF_OPTION_TYPE_PERCENT * refresh-interval GF_OPTION_TYPE_TIME - * disk-usage-limit GF_OPTION_TYPE_SIZET + * disk-usage-limit GF_OPTION_TYPE_SIZET storage/posix: * o-direct GF_OPTION_TYPE_BOOL @@ -104,16 +104,16 @@ storage/bdb: * access-mode GF_OPTION_TYPE_STR performance/read-ahead: - * force-atime-update GF_OPTION_TYPE_BOOL + * force-atime-update GF_OPTION_TYPE_BOOL * page-size GF_OPTION_TYPE_SIZET (64 * GF_UNIT_KB)-(2 * GF_UNIT_MB) - * page-count GF_OPTION_TYPE_INT 1-16 + * page-count GF_OPTION_TYPE_INT 1-16 performance/write-behind: * flush-behind GF_OPTION_TYPE_BOOL - * aggregate-size GF_OPTION_TYPE_SIZET (128 * GF_UNIT_KB)-(4 * GF_UNIT_MB) - * window-size GF_OPTION_TYPE_SIZET (512 * GF_UNIT_KB)-(1 * GF_UNIT_GB) - * enable-O_SYNC GF_OPTION_TYPE_BOOL - * disable-for-first-nbytes GF_OPTION_TYPE_SIZET 1 - (1 * GF_UNIT_MB) + * aggregate-size GF_OPTION_TYPE_SIZET (128 * GF_UNIT_KB)-(4 * GF_UNIT_MB) + * window-size GF_OPTION_TYPE_SIZET (512 * GF_UNIT_KB)-(1 * GF_UNIT_GB) + * enable-O_SYNC GF_OPTION_TYPE_BOOL + * disable-for-first-nbytes GF_OPTION_TYPE_SIZET 1 - (1 * GF_UNIT_MB) performance/symlink-cache: @@ -121,9 +121,9 @@ performance/io-threads: * thread-count GF_OPTION_TYPE_INT 1-32 performance/io-cache: - * priority GF_OPTION_TYPE_ANY - * cache-timeout (force-revalidate-timeout) GF_OPTION_TYPE_INT 0-60 - * page-size GF_OPTION_TYPE_SIZET (16 * GF_UNIT_KB)-(4 * 
GF_UNIT_MB) + * priority GF_OPTION_TYPE_ANY + * cache-timeout (force-revalidate-timeout) GF_OPTION_TYPE_INT 0-60 + * page-size GF_OPTION_TYPE_SIZET (16 * GF_UNIT_KB)-(4 * GF_UNIT_MB) * cache-size GF_OPTION_TYPE_SIZET (4 * GF_UNIT_MB)-(6 * GF_UNIT_GB) performance/quick-read: @@ -132,16 +132,16 @@ performance/quick-read: auth: - addr: - * auth.addr.*.allow GF_OPTION_TYPE_ANY - * auth.addr.*.reject GF_OPTION_TYPE_ANY + * auth.addr.*.allow GF_OPTION_TYPE_ANY + * auth.addr.*.reject GF_OPTION_TYPE_ANY - login: - * auth.login.*.allow GF_OPTION_TYPE_ANY + * auth.login.*.allow GF_OPTION_TYPE_ANY * auth.login.*.password GF_OPTION_TYPE_ANY scheduler/alu: - * scheduler.alu.order (alu.order) - GF_OPTION_TYPE_ANY + * scheduler.alu.order (alu.order) + GF_OPTION_TYPE_ANY * scheduler.alu.disk-usage.entry-threshold (alu.disk-usage.entry-threshold) GF_OPTION_TYPE_SIZET * scheduler.alu.disk-usage.exit-threshold (alu.disk-usage.exit-threshold) @@ -149,17 +149,17 @@ scheduler/alu: * scheduler.alu.write-usage.entry-threshold (alu.write-usage.entry-threshold) GF_OPTION_TYPE_SIZET * scheduler.alu.write-usage.exit-threshold (alu.write-usage.exit-threshold) - GF_OPTION_TYPE_SIZET + GF_OPTION_TYPE_SIZET * scheduler.alu.read-usage.entry-threshold (alu.read-usage.entry-threshold) GF_OPTION_TYPE_SIZET * scheduler.alu.read-usage.exit-threshold (alu.read-usage.exit-threshold) - GF_OPTION_TYPE_SIZET + GF_OPTION_TYPE_SIZET * scheduler.alu.open-files-usage.entry-threshold (alu.open-files-usage.entry-threshold) GF_OPTION_TYPE_INT * scheduler.alu.open-files-usage.exit-threshold (alu.open-files-usage.exit-threshold) - GF_OPTION_TYPE_INT + GF_OPTION_TYPE_INT * scheduler.read-only-subvolumes (alu.read-only-subvolumes) - GF_OPTION_TYPE_ANY + GF_OPTION_TYPE_ANY * scheduler.refresh-interval (alu.refresh-interval) GF_OPTION_TYPE_TIME * scheduler.limits.min-free-disk (alu.limits.min-free-disk) @@ -168,11 +168,11 @@ scheduler/alu: GF_OPTION_TYPE_INT scheduler/nufa: - * scheduler.refresh-interval 
(nufa.refresh-interval) + * scheduler.refresh-interval (nufa.refresh-interval) GF_OPTION_TYPE_TIME - * scheduler.limits.min-free-disk (nufa.limits.min-free-disk) + * scheduler.limits.min-free-disk (nufa.limits.min-free-disk) GF_OPTION_TYPE_PERCENT - * scheduler.local-volume-name (nufa.local-volume-name) + * scheduler.local-volume-name (nufa.local-volume-name) GF_OPTION_TYPE_XLATOR scheduler/random: @@ -204,20 +204,20 @@ transport/ib-verbs: * transport.ib-verbs.work-request-recv-count (ib-verbs-work-request-recv-count) GF_OPTION_TYPE_INT * remote-port (transport.remote-port,transport.ib-verbs.remote-port) - GF_OPTION_TYPE_INT - * transport.ib-verbs.listen-port GF_OPTION_TYPE_INT - * transport.ib-verbs.connect-path (connect-path) GF_OPTION_TYPE_ANY - * transport.ib-verbs.bind-path (bind-path) GF_OPTION_TYPE_ANY - * transport.ib-verbs.listen-path (listen-path) GF_OPTION_TYPE_ANY + GF_OPTION_TYPE_INT + * transport.ib-verbs.listen-port GF_OPTION_TYPE_INT + * transport.ib-verbs.connect-path (connect-path) GF_OPTION_TYPE_ANY + * transport.ib-verbs.bind-path (bind-path) GF_OPTION_TYPE_ANY + * transport.ib-verbs.listen-path (listen-path) GF_OPTION_TYPE_ANY * transport.address-family (address-family) GF_OPTION_TYPE_STR inet|inet6|inet/inet6| inet6/inet|unix|inet-sdp transport/socket: - * transport.remote-port (remote-port,transport.socket.remote-port) GF_OPTION_TYPE_INT - * transport.socket.listen-port (listen-port) GF_OPTION_TYPE_INT - * transport.socket.bind-address (bind-address) GF_OPTION_TYPE_ANY - * transport.socket.connect-path (connect-path) GF_OPTION_TYPE_ANY - * transport.socket.bind-path (bind-path) GF_OPTION_TYPE_ANY + * transport.remote-port (remote-port,transport.socket.remote-port) GF_OPTION_TYPE_INT + * transport.socket.listen-port (listen-port) GF_OPTION_TYPE_INT + * transport.socket.bind-address (bind-address) GF_OPTION_TYPE_ANY + * transport.socket.connect-path (connect-path) GF_OPTION_TYPE_ANY + * transport.socket.bind-path (bind-path) GF_OPTION_TYPE_ANY 
* transport.socket.listen-path (listen-path) GF_OPTION_TYPE_ANY * transport.address-family (address-family) GF_OPTION_TYPE_STR inet|inet6| inet/inet6|inet6/inet| diff --git a/doc/user-guide/unify.odg b/doc/legacy/unify.odg Binary files differindex ccaa9bf16..ccaa9bf16 100644 --- a/doc/user-guide/unify.odg +++ b/doc/legacy/unify.odg diff --git a/doc/user-guide/unify.pdf b/doc/legacy/unify.pdf Binary files differindex c22027f66..c22027f66 100644 --- a/doc/user-guide/unify.pdf +++ b/doc/legacy/unify.pdf diff --git a/doc/user-guide/user-guide.info b/doc/legacy/user-guide.info index 494c0e94e..2bbadb351 100644 --- a/doc/user-guide/user-guide.info +++ b/doc/legacy/user-guide.info @@ -6,7 +6,7 @@ END-INFO-DIR-ENTRY This is the user manual for GlusterFS 2.0. - Copyright (C) 2007-2010 Gluster, Inc. Permission is granted to + Copyright (c) 2007-2011 Gluster, Inc. Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation; with no Invariant Sections, no @@ -21,7 +21,7 @@ GlusterFS 2.0 User Guide This is the user manual for GlusterFS 2.0. - Copyright (C) 2007-2010 Gluster, Inc. Permission is granted to + Copyright (c) 2007-2011 Gluster, Inc. Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation; with no Invariant Sections, no @@ -130,7 +130,7 @@ suggestions. A huge thanks to them all. Patrick Negri - for TCP non-blocking connect. http://gluster.org/core-team.php (<list-hacking@gluster.com>) - Gluster + Gluster File: user-guide.info, Node: Introduction, Next: Installation and Invocation, Prev: Acknowledgements, Up: Top @@ -160,15 +160,15 @@ makes them all appear to be a part of the same filesystem. 
================= You can reach us through the mailing list *gluster-devel* -(<gluster-devel@nongnu.org>). +(<gluster-devel@nongnu.org>). You can also find many of the developers on IRC, on the `#gluster' -channel on Freenode (<irc.freenode.net>). +channel on Freenode (<irc.freenode.net>). The GlusterFS documentation wiki is also useful: <http://gluster.org/docs/index.php/GlusterFS> - For commercial support, you can contact Gluster at: + For commercial support, you can contact Gluster at: 3194 Winding Vista Common Fremont, CA 94539 @@ -397,8 +397,8 @@ command-line options accepted by it. `--volfile-server-port=<port-number>' Listening port number of volfile server. -`--volfile-server-transport=[socket|ib-verbs]' - Transport type to get volfile from server. [default: `socket'] +`--volfile-server-transport=[tcp|ib-verbs]' + Transport type to get volfile from server. [default: `tcp'] `--xlator-options=<volume-name.option=value>' Add/override a translator option for a volume with specified value. @@ -467,8 +467,8 @@ filesystem to appear. Example: `--volfile-server-port=<port-number>' Listening port number of volfile server. -`--volfile-server-transport=[socket|ib-verbs]' - Transport type to get volfile from server. [default: `socket'] +`--volfile-server-transport=[tcp|ib-verbs]' + Transport type to get volfile from server. [default: `tcp'] `--xlator-options=<volume-name.option=value>' Add/override a translator option for a volume with specified value. @@ -907,7 +907,7 @@ and the client. Whether to make the connection attempt asynchronous. `remote-port <n> (24007)' - Server port to connect to. + Server port to connect to. `remote-host <hostname> *' Hostname or IP address of the server. If the host name resolves to @@ -952,7 +952,7 @@ always best to use `ib-verbs'. Use `ib-sdp' only if you cannot get Whether to make the connection attempt asynchronous. `remote-port <n> (24007)' - Server port to connect to. + Server port to connect to. 
`remote-host <hostname> *' Hostname or IP address of the server. If the host name resolves to @@ -2057,7 +2057,7 @@ the `--log-file' option (See *note Client::). ======================= `modprobe fuse' fails with: "Unknown symbol in module, or unknown -parameter". +parameter". If you are using fuse-2.6.x on Redhat Enterprise Linux Work Station 4 and Advanced Server 4 with 2.6.9-42.ELlargesmp, 2.6.9-42.ELsmp, @@ -2188,7 +2188,7 @@ Appendix A GNU Free Documentation Licence This License is a kind of "copyleft", which means that derivative works of the document must themselves be free in the same sense. - It complements the GNU Affero General Public License, which is a copyleft + It complements the GNU General Public License, which is a copyleft license designed for free software. We have designed this License in order to use it for manuals for @@ -2591,7 +2591,7 @@ situation. If your document contains nontrivial examples of program code, we recommend releasing these examples in parallel under your choice of -free software license, such as the GNU Affero General Public License, to +free software license, such as the GNU General Public License, to permit their use in free software. diff --git a/doc/user-guide/user-guide.pdf b/doc/legacy/user-guide.pdf Binary files differindex ed7bd2a99..ed7bd2a99 100644 --- a/doc/user-guide/user-guide.pdf +++ b/doc/legacy/user-guide.pdf diff --git a/doc/user-guide/user-guide.texi b/doc/legacy/user-guide.texi index b679dee02..8e429853f 100644 --- a/doc/user-guide/user-guide.texi +++ b/doc/legacy/user-guide.texi @@ -10,7 +10,7 @@ @copying This is the user manual for GlusterFS 2.0. -Copyright @copyright{} 2007-2010 @email{@b{Gluster}} , Inc. Permission is granted to +Copyright @copyright{} 2007-2011 @email{@b{Gluster}} , Inc. 
Permission is granted to copy, distribute and/or modify this document under the terms of the @acronym{GNU} Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation; with no Invariant @@ -23,7 +23,7 @@ Documentation License''. @title GlusterFS 2.0 User Guide [DRAFT] @subtitle January 15, 2008 @author http://gluster.org/core-team.php -@author @email{@b{Gluster}} +@author @email{@b{Gluster}} @page @vskip 0pt plus 1filll @insertcopying @@ -36,78 +36,78 @@ Documentation License''. @insertcopying @menu -* Acknowledgements:: -* Introduction:: -* Installation and Invocation:: -* Concepts:: -* Translators:: -* Usage Scenarios:: -* Troubleshooting:: -* GNU Free Documentation Licence:: -* Index:: +* Acknowledgements:: +* Introduction:: +* Installation and Invocation:: +* Concepts:: +* Translators:: +* Usage Scenarios:: +* Troubleshooting:: +* GNU Free Documentation Licence:: +* Index:: @detailmenu --- The Detailed Node Listing --- Installation and Invocation -* Pre requisites:: -* Getting GlusterFS:: -* Building:: -* Running GlusterFS:: -* A Tutorial Introduction:: +* Pre requisites:: +* Getting GlusterFS:: +* Building:: +* Running GlusterFS:: +* A Tutorial Introduction:: Running GlusterFS -* Server:: -* Client:: +* Server:: +* Client:: Concepts -* Filesystems in Userspace:: -* Translator:: -* Volume specification file:: +* Filesystems in Userspace:: +* Translator:: +* Volume specification file:: Translators -* Storage Translators:: -* Client and Server Translators:: -* Clustering Translators:: -* Performance Translators:: -* Features Translators:: +* Storage Translators:: +* Client and Server Translators:: +* Clustering Translators:: +* Performance Translators:: +* Features Translators:: Storage Translators -* POSIX:: +* POSIX:: Client and Server Translators -* Transport modules:: -* Client protocol:: -* Server protocol:: +* Transport modules:: +* Client protocol:: +* Server protocol:: Clustering Translators -* Unify:: -* 
Replicate:: -* Stripe:: +* Unify:: +* Replicate:: +* Stripe:: Performance Translators -* Read Ahead:: -* Write Behind:: -* IO Threads:: -* IO Cache:: +* Read Ahead:: +* Write Behind:: +* IO Threads:: +* IO Cache:: -Features Translators +Features Translators -* POSIX Locks:: -* Fixed ID:: +* POSIX Locks:: +* Fixed ID:: Miscellaneous Translators -* ROT-13:: -* Trace:: +* ROT-13:: +* Trace:: @end detailmenu @end menu @@ -120,7 +120,7 @@ Miscellaneous Translators @node Acknowledgements @unnumbered Acknowledgements GlusterFS continues to be a wonderful and enriching experience for all -of us involved. +of us involved. GlusterFS development would not have been possible at this pace if not for our enthusiastic users. People from around the world have @@ -142,7 +142,7 @@ Jacques Mattheij - for Europe mirror. Patrick Negri - for TCP non-blocking connect. @flushright http://gluster.org/core-team.php (@email{list-hacking@@gluster.com}) -@email{@b{Gluster}} +@email{@b{Gluster}} @end flushright @node Introduction @@ -166,7 +166,7 @@ Need for distributed filesystems @end itemize @section Contacting us -You can reach us through the mailing list @strong{gluster-devel} +You can reach us through the mailing list @strong{gluster-devel} (@email{gluster-devel@@nongnu.org}). @cindex GlusterFS mailing list @@ -197,11 +197,11 @@ You can also email us at @email{support@@gluster.com}. @chapter Installation and Invocation @menu -* Pre requisites:: -* Getting GlusterFS:: -* Building:: -* Running GlusterFS:: -* A Tutorial Introduction:: +* Pre requisites:: +* Getting GlusterFS:: +* Building:: +* Running GlusterFS:: +* A Tutorial Introduction:: @end menu @node Pre requisites @@ -247,7 +247,7 @@ our patched version of the @acronym{FUSE} kernel module. See Patched FUSE for de @subsection Patched FUSE -The GlusterFS project maintains a patched version of @acronym{FUSE} meant to be used +The GlusterFS project maintains a patched version of @acronym{FUSE} meant to be used with GlusterFS. 
The patches increase GlusterFS performance. It is recommended that all users use the patched @acronym{FUSE}. @@ -311,7 +311,7 @@ $ cd glusterfs-<version> If you checked out the source from the Arch repository, you'll need to run @command{./autogen.sh} first. Note that you'll need to have -Autoconf and Automake installed for this. +Autoconf and Automake installed for this. Run @command{configure}. @@ -371,8 +371,8 @@ paths with the prefix. @section Running GlusterFS @menu -* Server:: -* Client:: +* Server:: +* Client:: @end menu @node Server @@ -386,7 +386,7 @@ of the GlusterFS server program and all the command-line options accepted by it. @cartouche @table @code Basic Options -@item -f, --volfile=<path> +@item -f, --volfile=<path> Use the volume file as the volume specification. @item -s, --volfile-server=<hostname> @@ -396,7 +396,7 @@ Basic Options Specify the path for the log file. @item -L, --log-level=<level> - Set the log level for the server. Log level should be one of @acronym{DEBUG}, + Set the log level for the server. Log level should be one of @acronym{DEBUG}, @acronym{WARNING}, @acronym{ERROR}, @acronym{CRITICAL}, or @acronym{NONE}. Advanced Options @@ -404,10 +404,10 @@ Advanced Options Run in debug mode. This option sets --no-daemon, --log-level to DEBUG and --log-file to console. -@item -N, --no-daemon +@item -N, --no-daemon Run glusterfsd as a foreground process. -@item -p, --pid-file=<path> +@item -p, --pid-file=<path> Path for the @acronym{PID} file. @item --volfile-id=<key> @@ -416,20 +416,20 @@ Advanced Options @item --volfile-server-port=<port-number> Listening port number of volfile server. -@item --volfile-server-transport=[socket|ib-verbs] - Transport type to get volfile from server. [default: @command{socket}] +@item --volfile-server-transport=[tcp|ib-verbs] + Transport type to get volfile from server. 
[default: @command{tcp}] @item --xlator-options=<volume-name.option=value> Add/override a translator option for a volume with specified value. Miscellaneous Options -@item -?, --help +@item -?, --help Show this help text. -@item --usage +@item --usage Display a short usage message. -@item -V, --version +@item -V, --version Show version information. @end table @end cartouche @@ -464,7 +464,7 @@ The command-line options are detailed below. @table @code Basic Options -@item -f, --volfile=<path> +@item -f, --volfile=<path> Use the volume file as the volume specification. @item -s, --volfile-server=<hostname> @@ -474,7 +474,7 @@ Basic Options Specify the path for the log file. @item -L, --log-level=<level> - Set the log level for the server. Log level should be one of @acronym{DEBUG}, + Set the log level for the server. Log level should be one of @acronym{DEBUG}, @acronym{WARNING}, @acronym{ERROR}, @acronym{CRITICAL}, or @acronym{NONE}. Advanced Options @@ -482,10 +482,10 @@ Advanced Options Run in debug mode. This option sets --no-daemon, --log-level to DEBUG and --log-file to console. -@item -N, --no-daemon +@item -N, --no-daemon Run @command{glusterfs} as a foreground process. -@item -p, --pid-file=<path> +@item -p, --pid-file=<path> Path for the @acronym{PID} file. @item --volfile-id=<key> @@ -494,8 +494,8 @@ Advanced Options @item --volfile-server-port=<port-number> Listening port number of volfile server. -@item --volfile-server-transport=[socket|ib-verbs] - Transport type to get volfile from server. [default: @command{socket}] +@item --volfile-server-transport=[tcp|ib-verbs] + Transport type to get volfile from server. [default: @command{tcp}] @item --xlator-options=<volume-name.option=value> Add/override a translator option for a volume with specified value. @@ -512,14 +512,14 @@ Advanced Options automatically if kernel supports big writes (>= 2.6.26). @item -e, --entry-timeout=<n> - Entry timeout for directory entries in the kernel, in seconds. 
+ Entry timeout for directory entries in the kernel, in seconds. Defaults to 1 second. Missellaneous Options -@item -?, --help +@item -?, --help Show this help information. -@item -V, --version +@item -V, --version Show version information. @end table @end cartouche @@ -527,7 +527,7 @@ Missellaneous Options @node A Tutorial Introduction @section A Tutorial Introduction -This section will show you how to quickly get GlusterFS up and running. We'll +This section will show you how to quickly get GlusterFS up and running. We'll configure GlusterFS as a simple network filesystem, with one server and one client. In this mode of usage, GlusterFS can serve as a replacement for NFS. @@ -545,18 +545,18 @@ be run on the server will be shown with the prompt: Our goal is to make a directory on the @emph{server} (say, @command{/export}) accessible to the @emph{client}. -First of all, get GlusterFS installed on both the machines, as described in the +First of all, get GlusterFS installed on both the machines, as described in the previous sections. Make sure you have the @acronym{FUSE} kernel module loaded. You -can ensure this by running: +can ensure this by running: @example [root@@server]# modprobe fuse @end example Before we can run the GlusterFS client or server programs, we need to write -two files called @emph{volume specifications} (equivalently refered to as @emph{volfiles}). +two files called @emph{volume specifications} (equivalently refered to as @emph{volfiles}). The volfile describes the @emph{translator tree} on a node. The next chapter will -explain the concepts of `translator' and `volume specification' in detail. For now, +explain the concepts of `translator' and `volume specification' in detail. For now, just assume that the volfile is like an NFS @command{/etc/export} file. 
On the server, create a text file somewhere (we'll assume the path @@ -572,7 +572,7 @@ end-volume volume server type protocol/server subvolumes colon-o - option transport-type tcp + option transport-type tcp option auth.addr.colon-o.allow * end-volume @end example @@ -625,9 +625,9 @@ working as a network file system. @chapter Concepts @menu -* Filesystems in Userspace:: -* Translator:: -* Volume specification file:: +* Filesystems in Userspace:: +* Translator:: +* Volume specification file:: @end menu @node Filesystems in Userspace @@ -639,16 +639,16 @@ is a kernel module/library that allows us to write a filesystem completely in userspace. @acronym{FUSE} consists of a kernel module which interacts with the userspace -implementation using a device file @code{/dev/fuse}. When a process +implementation using a device file @code{/dev/fuse}. When a process makes a syscall on a @acronym{FUSE} filesystem, @acronym{VFS} hands the request to the @acronym{FUSE} module, which writes the request to @code{/dev/fuse}. The userspace implementation polls @code{/dev/fuse}, and when a request arrives, processes it and writes the result back to @code{/dev/fuse}. The kernel then -reads from the device file and returns the result to the user process. +reads from the device file and returns the result to the user process. In case of GlusterFS, the userspace program is the GlusterFS client. The control flow is shown in the diagram below. The GlusterFS client -services the request by sending it to the server, which in turn +services the request by sending it to the server, which in turn hands it to the local @acronym{POSIX} filesystem. @center @image{fuse,44pc,,,.pdf} @@ -752,7 +752,7 @@ or ``forty-two''. line is considered the value; it is up to the translator to parse it. @item @emph{subvolume1}, @emph{subvolume2}, @dots{} - Volume names of sub-volumes. The sub-volumes must already have been defined earlier + Volume names of sub-volumes. 
The sub-volumes must already have been defined earlier in the file. @end table @@ -797,11 +797,11 @@ end-volume @chapter Translators @menu -* Storage Translators:: -* Client and Server Translators:: -* Clustering Translators:: -* Performance Translators:: -* Features Translators:: +* Storage Translators:: +* Client and Server Translators:: +* Clustering Translators:: +* Performance Translators:: +* Features Translators:: * Miscellaneous Translators:: @end menu @@ -823,12 +823,12 @@ Other storage backends are planned for the future. One of the possibilities is a Amazon S3 translator. Amazon S3 is an unlimited online storage service accessible through a web services @acronym{API}. The S3 translator will allow you to access the storage as a normal @acronym{POSIX} filesystem. -@footnote{Some more discussion about this can be found at: +@footnote{Some more discussion about this can be found at: http://developer.amazonwebservices.com/connect/message.jspa?messageID=52873} @menu -* POSIX:: +* POSIX:: * BDB:: @end menu @@ -843,14 +843,14 @@ filesystem as its ``backend'' to actually store files and directories. This can be any filesystem that supports extended attributes (@acronym{EXT3}, ReiserFS, @acronym{XFS}, ...). Extended attributes are used by some translators to store metadata, for -example, by the replicate and stripe translators. See +example, by the replicate and stripe translators. See @ref{Replicate} and @ref{Stripe}, respectively for details. @cartouche @table @code @item directory <path> The directory on the local filesystem which is to be used for storage. -@end table +@end table @end cartouche @node BDB @@ -862,7 +862,7 @@ type storage/bdb The @command{BDB} translator uses a @acronym{Berkeley DB} database as its ``backend'' to actually store files as key-value pair in the database and directories as regular @acronym{POSIX} directories. Note that @acronym{BDB} -does not provide extended attribute support for regular files. 
Do not use +does not provide extended attribute support for regular files. Do not use @acronym{BDB} as storage translator while using any translator that demands extended attributes on ``backend''. @@ -892,9 +892,9 @@ translator tree over the network or access a remote GlusterFS server. These two translators implement GlusterFS's network protocol. @menu -* Transport modules:: -* Client protocol:: -* Server protocol:: +* Transport modules:: +* Client protocol:: +* Server protocol:: @end menu @node Transport modules @@ -962,7 +962,7 @@ This module accepts the same options as @command{tcp} @cindex infiniband transport -InfiniBand is a scalable switched fabric interconnect mechanism +InfiniBand is a scalable switched fabric interconnect mechanism primarily used in high-performance computing. InfiniBand can deliver data throughput of the order of 10 Gbit/s, with latencies of 4-5 ms. @@ -970,7 +970,7 @@ The @command{ib-verbs} transport accesses the InfiniBand hardware through the ``verbs'' @acronym{API}, which is the lowest level of software access possible and which gives the highest performance. On InfiniBand hardware, it is always best to use @command{ib-verbs}. Use @command{ib-sdp} only if you cannot get -@command{ib-verbs} working for some reason. +@command{ib-verbs} working for some reason. The @command{ib-verbs} client module accepts the following options: @@ -1049,7 +1049,7 @@ translator tree. @item transport-type [tcp,ib-sdp,ib-verbs] (tcp) The transport type to use. You should use the client versions of all the -transport modules (@command{tcp}, @command{ib-sdp}, +transport modules (@command{tcp}, @command{ib-sdp}, @command{ib-verbs}). @item remote-subvolume <volume_name> * The name of the volume on the remote host to attach to. Note that @@ -1075,7 +1075,7 @@ remote GlusterFS clients. @table @code @item client-volume-filename <path> (<CONFDIR>/glusterfs-client.vol) The volume specification file to use for the client. 
This is the file the -client will receive when it is invoked with the @command{--server} option +client will receive when it is invoked with the @command{--server} option (@ref{Client}). @item transport-type [tcp,ib-verbs,ib-sdp] (tcp) @@ -1106,9 +1106,9 @@ translator allows a file to be spread across many server nodes. The following se look at each of these translators in detail. @menu -* Unify:: -* Replicate:: -* Stripe:: +* Unify:: +* Replicate:: +* Stripe:: @end menu @node Unify @@ -1121,7 +1121,7 @@ type cluster/unify The unify translator presents a `unified' view of all its sub-volumes. That is, it makes the union of all its sub-volumes appear as a single volume. It is the -unify translator that gives GlusterFS the ability to access an arbitrarily +unify translator that gives GlusterFS the ability to access an arbitrarily large amount of storage. For unify to work correctly, certain invariants need to be maintained across @@ -1144,7 +1144,7 @@ Looking at the second requirement, you might wonder how one can accomplish storing redundant copies of a file, if no file can exist multiple times. To answer, we must remember that these invariants are from @emph{unify's perspective}. A translator such as replicate at a lower -level in the translator tree than unify may subvert this picture. +level in the translator tree than unify may subvert this picture. The first invariant might seem quite tedious to ensure. We shall see later that this is not so, since unify's @emph{self-heal} mechanism @@ -1241,8 +1241,8 @@ are allowed. For example: @command{option alu.limits.min-free-disk 5GB}. 
@item alu.read-usage.exit-threshold <%> (5) @item alu.open-files-usage.entry-threshold <n> (1000) @item alu.open-files-usage.exit-threshold <n> (100) -@item alu.limits.min-free-disk <%> -@item alu.limits.max-open-files <n> +@item alu.limits.min-free-disk <%> +@item alu.limits.max-open-files <n> @end table @end cartouche @@ -1259,7 +1259,7 @@ files are mostly similar in size and I/O access pattern, this scheduler is a good choice. RR scheduler checks for free disk space on the server before scheduling, so you can know when to add another server node. The default value of min-free-disk is 5% and is -checked on file creation calls, with atleast 10 seconds (by default) +checked on file creation calls, with atleast 10 seconds (by default) elapsing between two checks. Options: @@ -1315,7 +1315,7 @@ than a specified amount (5% by default) then @acronym{NUFA} schedules files among the other child volumes in a round-robin fashion. @acronym{NUFA} is named after the similar strategy used for memory access, -@acronym{NUMA}@footnote{Non-Uniform Memory Access: +@acronym{NUMA}@footnote{Non-Uniform Memory Access: @indicateurl{http://en.wikipedia.org/wiki/Non-Uniform_Memory_Access}}. @cartouche @@ -1325,7 +1325,7 @@ Minimum disk space that must be free (local or remote) for @acronym{NUFA} to sch file to it. @item nufa.refresh-interval <t> (10 seconds) Time between two successive free disk space checks. -@item nufa.local-volume-name <volume> +@item nufa.local-volume-name <volume> The name of the volume corresponding to the local system. This volume must be one of the children of the unify volume. This option is mandatory. @end table @@ -1397,13 +1397,13 @@ end-volume @end example This sample configuration will replicate all directories and files on -brick1, brick2 and brick3. +brick1, brick2 and brick3. All the read operations happen from the first alive child. If all the three sub-volumes are up, reads will be done from brick1; if brick1 is down read will be done from brick2. 
In case read() was being done on brick1 and it goes down, replicate transparently falls back to -brick2. +brick2. The next release of GlusterFS will add the following features: @itemize @@ -1496,7 +1496,7 @@ type cluster/stripe The stripe translator distributes the contents of a file over its sub-volumes. It does this by creating a file equal in size to the total size of the file on each of its sub-volumes. It then writes only -a part of the file to each sub-volume, leaving the rest of it empty. +a part of the file to each sub-volume, leaving the rest of it empty. These empty regions are called `holes' in Unix terminology. The holes do not consume any disk space. @@ -1504,14 +1504,14 @@ The diagram below makes this clear. @center @image{stripe,44pc,,,.pdf} -You can configure stripe so that only filenames matching a pattern -are striped. You can also configure the size of the data to be stored +You can configure stripe so that only filenames matching a pattern +are striped. You can also configure the size of the data to be stored on each sub-volume. @cartouche @table @code @item block-size <pattern>:<size> (*:0 no striping) -Distribute files matching @command{<pattern>} over the sub-volumes, +Distribute files matching @command{<pattern>} over the sub-volumes, storing at least @command{<size>} on each sub-volume. For example, @example @@ -1530,9 +1530,9 @@ different sizes for different file name patterns. @section Performance Translators @menu -* Read Ahead:: -* Write Behind:: -* IO Threads:: +* Read Ahead:: +* Write Behind:: +* IO Threads:: * IO Cache:: * Booster:: @end menu @@ -1547,9 +1547,9 @@ type performance/read-ahead The read-ahead translator pre-fetches data in advance on every read. This benefits applications that mostly process files in sequential order, since the next block of data will already be available by the time the -application is done with the current one. +application is done with the current one. 
-Additionally, the read-ahead translator also behaves as a read-aggregator. +Additionally, the read-ahead translator also behaves as a read-aggregator. Many small read operations are combined and issued as fewer, larger read requests to the server. @@ -1557,7 +1557,7 @@ Read-ahead deals in ``pages'' as the unit of data fetched. The page size is configurable, as is the ``page count'', which is the number of pages that are pre-fetched. -Read-ahead is best used with InfiniBand (using the ib-verbs transport). +Read-ahead is best used with InfiniBand (using the ib-verbs transport). On FastEthernet and Gigabit Ethernet networks, GlusterFS can achieve the link-maximum throughput even without read-ahead, making it quite superflous. @@ -1577,7 +1577,7 @@ The unit of data that is pre-fetched. The number of pages that are pre-fetched. @item force-atime-update [on|off|yes|no] (off|no) Whether to force an access time (atime) update on the file on every read. Without -this, the atime will be slightly imprecise, as it will reflect the time when +this, the atime will be slightly imprecise, as it will reflect the time when the read-ahead translator read the data, not when the application actually read it. @end table @end cartouche @@ -1596,7 +1596,7 @@ write-behind translator, successive write requests can be pipelined. This mode of write-behind operation is best used on the client side, to enable decreased write latency for the application. -The write-behind translator can also aggregate write requests. If the +The write-behind translator can also aggregate write requests. If the @command{aggregate-size} option is specified, then successive writes upto that size are accumulated and written in a single operation. This mode of operation is best used on the server side, as this will decrease the disk's head movement @@ -1659,7 +1659,7 @@ It caches data upto @command{cache-size} bytes. 
The cache is maintained as a prioritized least-recently-used (@acronym{LRU}) list, with priorities determined by user-specified patterns to match filenames. -When the IO cache translator detects a write operation, the +When the IO cache translator detects a write operation, the cache for that file is flushed. The IO cache translator periodically verifies the consistency of @@ -1715,11 +1715,11 @@ can start your application as: The booster translator accepts no options. @node Features Translators -@section Features Translators +@section Features Translators @menu -* POSIX Locks:: -* Fixed ID:: +* POSIX Locks:: +* Fixed ID:: @end menu @node POSIX Locks @@ -1783,8 +1783,8 @@ The @acronym{GID} to send to the server @section Miscellaneous Translators @menu -* ROT-13:: -* Trace:: +* ROT-13:: +* Trace:: @end menu @node ROT-13 @@ -1799,7 +1799,7 @@ contents using the @acronym{ROT-13} algorithm. @acronym{ROT-13} is a trivial algorithm that rotates each alphabet by thirteen places. Thus, 'A' becomes 'N', 'B' becomes 'O', and 'Z' becomes 'M'. -It goes without saying that you shouldn't use this translator if you need +It goes without saying that you shouldn't use this translator if you need @emph{real} encryption (a future release of GlusterFS will have real encryption translators). @@ -1816,7 +1816,7 @@ Whether to decrypt on read @subsection Trace @cindex trace (translator) @example -type debug/trace +type debug/trace @end example The trace translator is intended for debugging purposes. When loaded, it @@ -1827,23 +1827,23 @@ level of DEBUG (See @ref{Running GlusterFS}) for trace to work. 
Sample trace output (lines have been wrapped for readability): @cartouche @example -2007-10-30 00:08:58 D [trace.c:1579:trace_opendir] trace: callid: 68 -(*this=0x8059e40, loc=0x8091984 @{path=/iozone3_283, inode=0x8091f00@}, +2007-10-30 00:08:58 D [trace.c:1579:trace_opendir] trace: callid: 68 +(*this=0x8059e40, loc=0x8091984 @{path=/iozone3_283, inode=0x8091f00@}, fd=0x8091d50) -2007-10-30 00:08:58 D [trace.c:630:trace_opendir_cbk] trace: +2007-10-30 00:08:58 D [trace.c:630:trace_opendir_cbk] trace: (*this=0x8059e40, op_ret=4, op_errno=1, fd=0x8091d50) -2007-10-30 00:08:58 D [trace.c:1602:trace_readdir] trace: callid: 69 +2007-10-30 00:08:58 D [trace.c:1602:trace_readdir] trace: callid: 69 (*this=0x8059e40, size=4096, offset=0 fd=0x8091d50) -2007-10-30 00:08:58 D [trace.c:215:trace_readdir_cbk] trace: +2007-10-30 00:08:58 D [trace.c:215:trace_readdir_cbk] trace: (*this=0x8059e40, op_ret=0, op_errno=0, count=4) -2007-10-30 00:08:58 D [trace.c:1624:trace_closedir] trace: callid: 71 +2007-10-30 00:08:58 D [trace.c:1624:trace_closedir] trace: callid: 71 (*this=0x8059e40, *fd=0x8091d50) -2007-10-30 00:08:58 D [trace.c:809:trace_closedir_cbk] trace: +2007-10-30 00:08:58 D [trace.c:809:trace_closedir_cbk] trace: (*this=0x8059e40, op_ret=0, op_errno=1) @end example @end cartouche @@ -1871,7 +1871,7 @@ scheduling on a single storage volume. Alternatively users can choose to have two separate volumes and hence two mount points, but the applications may demand a single storage system to host both. -This document explains how to mix file level scheduling with stripe. +This document explains how to mix file level scheduling with stripe. @subsection Configuration Brief @@ -1904,17 +1904,17 @@ addresses / access control fields to match your environment. 
type storage/posix option directory /export/for-unify end-volume - + volume posix-stripe type storage/posix option directory /export/for-stripe end-volume - + volume posix-namespace type storage/posix option directory /export/for-namespace end-volume - + volume server type protocol/server option transport-type tcp @@ -1963,7 +1963,7 @@ addresses / access control fields to match your environment. option remote-host 192.168.1.4 option remote-subvolume posix-unify end-volume - + volume client-stripe-1 type protocol/client option transport-type tcp @@ -1991,13 +1991,13 @@ addresses / access control fields to match your environment. option remote-host 192.168.1.4 option remote-subvolume posix-stripe end-volume - + volume unify type cluster/unify option scheduler rr subvolumes cluster-unify-1 cluster-unify-2 cluster-unify-3 cluster-unify-4 end-volume - + volume stripe type cluster/stripe option block-size *.img:2MB # All files ending with .img are striped with 2MB stripe block size. @@ -2046,13 +2046,13 @@ concludes with the suggested procedure to report bugs in GlusterFS. @subsection Server errors @example -glusterfsd: FATAL: could not open specfile: +glusterfsd: FATAL: could not open specfile: '/etc/glusterfs/glusterfsd.vol' @end example -The GlusterFS server expects the volume specification file to be +The GlusterFS server expects the volume specification file to be at @command{/etc/glusterfs/glusterfsd.vol}. The example -specification file will be installed as +specification file will be installed as @command{/etc/glusterfs/glusterfsd.vol.sample}. You need to edit it and rename it, or provide a different specification file using the @command{--spec-file} command line option (See @ref{Server}). @@ -2060,7 +2060,7 @@ the @command{--spec-file} command line option (See @ref{Server}). 
@vskip 4ex @example -gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfsd.log" +gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfsd.log" (Permission denied) @end example @@ -2072,7 +2072,7 @@ file using the @command{--log-file} option (See @ref{Server}). @subsection Client errors @example -fusermount: failed to access mountpoint /mnt: +fusermount: failed to access mountpoint /mnt: Transport endpoint is not connected @end example @@ -2114,7 +2114,7 @@ port instead. @vskip 4ex @example -gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfs.log" +gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfs.log" (Permission denied) @end example diff --git a/doc/user-guide/xlator.odg b/doc/legacy/xlator.odg Binary files differindex 179a65f6e..179a65f6e 100644 --- a/doc/user-guide/xlator.odg +++ b/doc/legacy/xlator.odg diff --git a/doc/user-guide/xlator.pdf b/doc/legacy/xlator.pdf Binary files differindex a07e14d67..a07e14d67 100644 --- a/doc/user-guide/xlator.pdf +++ b/doc/legacy/xlator.pdf diff --git a/doc/logging.txt b/doc/logging.txt new file mode 100644 index 000000000..b4ee45996 --- /dev/null +++ b/doc/logging.txt @@ -0,0 +1,66 @@ + +New logging framework in glusterfs is targeted for end users like +customers, community members, testers etc. This aims to bring clear, +understandable logs called user logs whereas the current logging are +considered as developer logs. The new framework brings with following +features + +* Each message is logged with proper well defined error code and each + error code has well known error message. +* A logbook has defined error code and error messages. It helps to + keep track of possible causes and remedies +* Log are sent to syslog. 
The syslog application can be configured to + pass them to centralized logging system +* It brings + - Remove repeated log messages + - Send alerts to users on certain events + - Run a program on events + - Call home service on events + + +Log book: +========= +A log book is a JSON formatted file error-codes.json located in top +level of glusterfs source repository. At compile time, gen-headers.py +generates libglusterfs/src/gf-error-codes.h using the log book and +gf-error-codes.h.template file. libglusterfs/src/gf-error-codes.h +consists of header definitions and helper functions to get message by +code for given locale. Currently it has _gf_get_message() function +returns message for locale 'en'. + +New entry to log book is added like + +{ + "IO_ERROR": {"code": 2233, + "message": {"en": "I/O error occurred"}}, + "SETUP_ERROR": {"code": 2240, + "message": {"en": "Setup error"}}, +} + + +Logging: +======== +The framework provides two functions + +void gf_openlog (const char *ident, int option, int facility); +void gf_syslog (int error_code, int facility_priority, char *format, ...); + +Consumers need to call gf_openlog() prior to gf_syslog() like the way +traditional syslog function calls. error_code is mandatory when using +gf_syslog(). For example, + +gf_openlog (NULL, -1, -1); +gf_syslog (GF_ERR_DEV, LOG_ERR, "error reading configuration file"); + +The logs are sent in CEE format (http://cee.mitre.org/) to syslog. +Its targeted to rsyslog syslog server. + +This log framework is enabled at compile time by default. This can be +disabled by passing '--disable-syslog' to ./configure or '--without +syslog' to rpmbuild + +Even though its enabled at compile time, its required to have +/etc/glusterfs/logger.conf file to make it into effect before starting +gluster services + +Currently all gluster logs are sent with error code GF_ERR_DEV. 
diff --git a/doc/mount.glusterfs.8 b/doc/mount.glusterfs.8 index 0552f7f61..01b7f7554 100644 --- a/doc/mount.glusterfs.8 +++ b/doc/mount.glusterfs.8 @@ -1,19 +1,10 @@ -.\" Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> +.\" Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> .\" This file is part of GlusterFS. .\" -.\" GlusterFS is free software; you can redistribute it and/or modify -.\" it under the terms of the GNU Affero General Public License as published -.\" by the Free Software Foundation; either version 3 of the License, -.\" or (at your option) any later version. -.\" -.\" GlusterFS is distributed in the hope that it will be useful, but -.\" WITHOUT ANY WARRANTY; without even the implied warranty of -.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -.\" Affero General Public License for more details. -.\" -.\" You should have received a copy of the GNU Affero General Public License -.\" long with this program. If not, see -.\" <http://www.gnu.org/licenses/>. +.\" This file is licensed to you under your choice of the GNU Lesser +.\" General Public License, version 3 or any later version (LGPLv3 or +.\" later), or the GNU General Public License, version 2 (GPLv2), in all +.\" cases as published by the Free Software Foundation. 
.\" .\" .\" @@ -57,7 +48,7 @@ Mount the filesystem read-only Volume key or name of the volume file to be fetched from server .TP \fBtransport=\fRTRANSPORT-TYPE -Transport type to get volume file from server [default: socket] +Transport type to get volume file from server [default: tcp] .TP \fBvolume\-name=\fRVOLUME-NAME Volume name to be used for MOUNT-POINT [default: top most volume in diff --git a/doc/qa/qa-client.vol b/doc/qa/qa-client.vol deleted file mode 100644 index 176dda589..000000000 --- a/doc/qa/qa-client.vol +++ /dev/null @@ -1,170 +0,0 @@ -# This spec file should be used for testing before any release -# - -# 1st client -volume client1 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport -# option transport.ib-verbs.work-request-send-size 131072 -# option transport.ib-verbs.work-request-send-count 64 -# option transport.ib-verbs.work-request-recv-size 131072 -# option transport.ib-verbs.work-request-recv-count 64 - option remote-host 127.0.0.1 - option remote-subvolume ra1 -end-volume - -# 2nd client -volume client2 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport - option remote-host 127.0.0.1 - option remote-subvolume ra2 -end-volume - -# 3rd client -volume client3 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport - option remote-host 127.0.0.1 - option remote-subvolume ra3 -end-volume - -# 4th client -volume client4 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport - option remote-host 
127.0.0.1 - option remote-subvolume ra4 -end-volume - -# 5th client -volume client5 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport - option remote-host 127.0.0.1 - option remote-subvolume ra5 -end-volume - -# 6th client -volume client6 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport - option remote-host 127.0.0.1 - option remote-subvolume ra6 -end-volume - -# 7th client -volume client7 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport - option remote-host 127.0.0.1 - option remote-subvolume ra7 -end-volume - -# 8th client -volume client8 - type protocol/client - option transport-type tcp # for TCP/IP transport -# option transport-type ib-sdp # for Infiniband transport -# option transport-type ib-verbs # for ib-verbs transport - option remote-host 127.0.0.1 - option remote-subvolume ra8 -end-volume - -# 1st Stripe (client1 client2) -volume stripe1 - type cluster/stripe - subvolumes client1 client2 - option block-size 128KB # all striped in 128kB block -end-volume - -# 2st Stripe (client3 client4) -volume stripe2 - type cluster/stripe - subvolumes client3 client4 - option block-size 128KB # all striped in 128kB block -end-volume - -# 3st Stripe (client5 client6) -volume stripe3 - type cluster/stripe - subvolumes client5 client6 - option block-size 128KB # all striped in 128kB block -end-volume - -# 4st Stripe (client7 client8) -volume stripe4 - type cluster/stripe - subvolumes client7 client8 - option block-size 128KB # all striped in 128kB block -end-volume - - -# 1st replicate -volume replicate1 - type cluster/replicate - subvolumes stripe1 
stripe2 -end-volume - -# 2nd replicate -volume replicate2 - type cluster/replicate - subvolumes stripe3 stripe4 -end-volume - -volume ns - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option remote-subvolume brick-ns -end-volume - -# Unify -volume unify0 - type cluster/unify - subvolumes replicate1 replicate2 -# subvolumes stripe1 stripe3 - option namespace ns - option scheduler rr # random # alu # nufa - option rr.limits.min-free-disk 1GB -# option alu.order x -# option alu.x.entry-threshold -# option alu.x.exit-threshold -end-volume - - -# ==== Performance Translators ==== -# The default options for performance translators should be the best for 90+% of the cases -volume iot - type performance/io-threads - subvolumes unify0 -end-volume - -volume wb - type performance/write-behind - subvolumes iot -end-volume - -volume ioc - type performance/io-cache - subvolumes wb -end-volume - -volume ra - type performance/read-ahead - subvolumes ioc -end-volume diff --git a/doc/qa/qa-high-avail-client.vol b/doc/qa/qa-high-avail-client.vol deleted file mode 100644 index 69cb8dd30..000000000 --- a/doc/qa/qa-high-avail-client.vol +++ /dev/null @@ -1,17 +0,0 @@ -volume client - type protocol/client - option transport-type tcp - option remote-host localhost - option transport.socket.remote-port 7001 - option remote-subvolume server1-iot -end-volume - -volume ra - type performance/read-ahead - subvolumes client -end-volume - -volume wb - type performance/write-behind - subvolumes ra -end-volume diff --git a/doc/qa/qa-high-avail-server.vol b/doc/qa/qa-high-avail-server.vol deleted file mode 100644 index 3556b9dae..000000000 --- a/doc/qa/qa-high-avail-server.vol +++ /dev/null @@ -1,344 +0,0 @@ - -# -- server 1 -- -volume server1-posix1 - type storage/posix - option directory /tmp/ha-export1/ -end-volume - -volume server1-ns1 - type storage/posix - option directory /tmp/ha-export-ns1/ -end-volume - -volume server1-client2 - type protocol/client - 
option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7002 - option remote-subvolume server2-posix2 -end-volume - -volume server1-ns2 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7002 - option remote-subvolume server2-ns2 -end-volume - -volume server1-client3 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7003 - option remote-subvolume server3-posix3 -end-volume - -volume server1-ns3 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7003 - option remote-subvolume server3-ns3 -end-volume - -volume server1-io1 - type performance/io-threads - option thread-count 8 - subvolumes server1-posix1 -end-volume - - -volume server1-io2 - type performance/io-threads - option thread-count 8 - subvolumes server1-client2 -end-volume - -volume server1-io3 - type performance/io-threads - option thread-count 8 - subvolumes server1-client3 -end-volume - -volume server1-ns-io1 - type performance/io-threads - option thread-count 8 - subvolumes server1-ns1 -end-volume - -volume server1-ns-io2 - type performance/io-threads - option thread-count 8 - subvolumes server1-ns2 -end-volume - -volume server1-ns-io3 - type performance/io-threads - option thread-count 8 - subvolumes server1-ns3 -end-volume - -volume server1-ns-replicate - type cluster/replicate - subvolumes server1-ns-io1 server1-ns-io2 server1-ns-io3 -end-volume - -volume server1-storage-replicate - type cluster/replicate - subvolumes server1-io1 server1-io2 server1-io3 -end-volume - -volume server1-unify - type cluster/unify - #option self-heal off - subvolumes server1-storage-replicate - option namespace server1-ns-replicate - option scheduler rr -end-volume - -volume server1-iot - type performance/io-threads - option thread-count 8 - subvolumes server1-unify -end-volume - 
-volume server1 - type protocol/server - option transport-type tcp - subvolumes server1-iot - option transport.socket.listen-port 7001 - option auth.addr.server1-posix1.allow * - option auth.addr.server1-ns1.allow * - option auth.addr.server1-iot.allow * -end-volume - - -# == Server2 == -volume server2-client1 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7001 - option remote-subvolume server1-posix1 -end-volume - -volume server2-ns1 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7001 - option remote-subvolume server1-ns1 -end-volume - -volume server2-posix2 - type storage/posix - option directory /tmp/ha-export2/ -end-volume - -volume server2-ns2 - type storage/posix - option directory /tmp/ha-export-ns2/ -end-volume - -volume server2-client3 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7003 - option remote-subvolume server3-posix3 -end-volume - -volume server2-ns3 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7003 - option remote-subvolume server3-ns3 -end-volume - -volume server2-io1 - type performance/io-threads - option thread-count 8 - subvolumes server2-client1 -end-volume - - -volume server2-io2 - type performance/io-threads - option thread-count 8 - subvolumes server2-posix2 -end-volume - -volume server2-io3 - type performance/io-threads - option thread-count 8 - subvolumes server2-client3 -end-volume - -volume server2-ns-io1 - type performance/io-threads - option thread-count 8 - subvolumes server2-ns1 -end-volume - -volume server2-ns-io2 - type performance/io-threads - option thread-count 8 - subvolumes server2-ns2 -end-volume - -volume server2-ns-io3 - type performance/io-threads - option thread-count 8 - subvolumes server2-ns3 -end-volume - -volume 
server2-ns-replicate - type cluster/replicate - subvolumes server2-ns-io1 server2-ns-io2 server2-ns-io3 -end-volume - -volume server2-storage-replicate - type cluster/replicate - subvolumes server2-io2 server2-io3 server2-io1 -end-volume - -volume server2-unify - type cluster/unify - option self-heal off - subvolumes server2-storage-replicate - option namespace server2-ns-replicate - option scheduler rr -end-volume - -volume server2-iot - type performance/io-threads - option thread-count 8 - subvolumes server2-unify -end-volume - -volume server2 - type protocol/server - option transport-type tcp - subvolumes server2-iot - option transport.socket.listen-port 7002 - option auth.addr.server2-posix2.allow * - option auth.addr.server2-ns2.allow * - option auth.addr.server2-iot.allow * -end-volume - -# == server 3 == -volume server3-client1 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7001 - option remote-subvolume server1-posix1 -end-volume - -volume server3-ns1 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7001 - option remote-subvolume server1-ns1 -end-volume - -volume server3-client2 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7002 - option remote-subvolume server2-posix2 -end-volume - -volume server3-ns2 - type protocol/client - option transport-type tcp - option remote-host 127.0.0.1 - option transport.socket.remote-port 7002 - option remote-subvolume server2-ns2 -end-volume - -volume server3-posix3 - type storage/posix - option directory /tmp/ha-export3/ -end-volume - -volume server3-ns3 - type storage/posix - option directory /tmp/ha-export-ns3/ -end-volume - -volume server3-io1 - type performance/io-threads - option thread-count 8 - subvolumes server3-client1 -end-volume - - -volume server3-io2 - type performance/io-threads - option 
thread-count 8 - subvolumes server3-client2 -end-volume - -volume server3-io3 - type performance/io-threads - option thread-count 8 - subvolumes server3-posix3 -end-volume - -volume server3-ns-io1 - type performance/io-threads - option thread-count 8 - subvolumes server3-ns1 -end-volume - -volume server3-ns-io2 - type performance/io-threads - option thread-count 8 - subvolumes server3-ns2 -end-volume - -volume server3-ns-io3 - type performance/io-threads - option thread-count 8 - subvolumes server3-ns3 -end-volume - -volume server3-ns-replicate - type cluster/replicate - subvolumes server3-ns-io1 server3-ns-io2 server3-ns-io3 -end-volume - -volume server3-storage-replicate - type cluster/replicate - subvolumes server3-io3 server3-io2 server3-io1 -end-volume - -volume server3-unify - type cluster/unify - option self-heal off - subvolumes server3-storage-replicate - option namespace server3-ns-replicate - option scheduler rr -end-volume - -volume server3-iot - type performance/io-threads - option thread-count 8 - subvolumes server3-unify -end-volume - -volume server3 - type protocol/server - option transport-type tcp - subvolumes server3-iot - option transport.socket.listen-port 7003 - option auth.addr.server3-posix3.allow * - option auth.addr.server3-ns3.allow * - option auth.addr.server3-iot.allow * -end-volume - diff --git a/doc/qa/qa-server.vol b/doc/qa/qa-server.vol deleted file mode 100644 index 1c245c324..000000000 --- a/doc/qa/qa-server.vol +++ /dev/null @@ -1,284 +0,0 @@ -# This spec file should be used for testing before any release -# - -# Namespace posix -volume brick-ns - type storage/posix # POSIX FS translator - option directory /tmp/export-ns # Export this directory -end-volume - -# 1st server - -volume brick1 - type storage/posix # POSIX FS translator - option directory /tmp/export1 # Export this directory -end-volume - -# == Posix-Locks == - volume plocks1 - type features/posix-locks -# option mandatory on - subvolumes brick1 - end-volume - -volume 
iot1 - type performance/io-threads - subvolumes plocks1 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb1 - type performance/write-behind - subvolumes iot1 -# option <key> <value> -end-volume - -volume ra1 - type performance/read-ahead - subvolumes wb1 -# option <key> <value> -end-volume - -volume brick2 - type storage/posix # POSIX FS translator - option directory /tmp/export2 # Export this directory -end-volume - -# == TrashCan Translator == -# volume trash2 -# type features/trash -# option trash-dir /.trashcan -# subvolumes brick2 -# end-volume - -# == Posix-Locks == -volume plocks2 - type features/posix-locks -# option <something> <something> - subvolumes brick2 -end-volume - -volume iot2 - type performance/io-threads - subvolumes plocks2 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb2 - type performance/write-behind - subvolumes iot2 -# option <key> <value> -end-volume - -volume ra2 - type performance/read-ahead - subvolumes wb2 -# option <key> <value> -end-volume - -volume brick3 - type storage/posix # POSIX FS translator - option directory /tmp/export3 # Export this directory -end-volume - -# == TrashCan Translator == -# volume trash3 -# type features/trash -# option trash-dir /.trashcan -# subvolumes brick3 -# end-volume - -# == Posix-Locks == -volume plocks3 - type features/posix-locks -# option <something> <something> - subvolumes brick3 -end-volume - -volume iot3 - type performance/io-threads - subvolumes plocks3 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb3 - type performance/write-behind - subvolumes iot3 -# option <key> <value> -end-volume - -volume ra3 - type performance/read-ahead - subvolumes wb3 -# option <key> <value> -end-volume - -volume brick4 - type storage/posix # POSIX FS translator - option directory /tmp/export4 # Export this directory 
-end-volume - -# == Posix-Locks == -volume plocks4 - type features/posix-locks -# option <something> <something> - subvolumes brick4 -end-volume - -volume iot4 - type performance/io-threads - subvolumes plocks4 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb4 - type performance/write-behind - subvolumes iot4 -# option <key> <value> -end-volume - -volume ra4 - type performance/read-ahead - subvolumes wb4 -# option <key> <value> -end-volume - -volume brick5 - type storage/posix # POSIX FS translator - option directory /tmp/export5 # Export this directory -end-volume - - -# == Posix-Locks == -volume plocks5 - type features/posix-locks -# option <something> <something> - subvolumes brick5 -end-volume - -volume iot5 - type performance/io-threads - subvolumes plocks5 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb5 - type performance/write-behind - subvolumes iot5 -# option <key> <value> -end-volume - -volume ra5 - type performance/read-ahead - subvolumes wb5 -# option <key> <value> -end-volume - -volume brick6 - type storage/posix # POSIX FS translator - option directory /tmp/export6 # Export this directory -end-volume - -# == Posix-Locks == -volume plocks6 - type features/posix-locks -# option <something> <something> - subvolumes brick6 -end-volume - -volume iot6 - type performance/io-threads - subvolumes plocks6 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb6 - type performance/write-behind - subvolumes iot6 -# option <key> <value> -end-volume - -volume ra6 - type performance/read-ahead - subvolumes wb6 -# option <key> <value> -end-volume - -volume brick7 - type storage/posix # POSIX FS translator - option directory /tmp/export7 # Export this directory -end-volume - -# == Posix-Locks == -volume plocks7 - type features/posix-locks -# option <something> <something> - 
subvolumes brick7 -end-volume - -volume iot7 - type performance/io-threads - subvolumes plocks7 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb7 - type performance/write-behind - subvolumes iot7 -# option <key> <value> -end-volume - -volume ra7 - type performance/read-ahead - subvolumes wb7 -# option <key> <value> -end-volume - -volume brick8 - type storage/posix # POSIX FS translator - option directory /tmp/export8 # Export this directory -end-volume - -# == Posix-Locks == -volume plocks8 - type features/posix-locks -# option <something> <something> - subvolumes brick8 -end-volume - -volume iot8 - type performance/io-threads - subvolumes plocks8 # change properly if above commented volumes needs to be included -# option <key> <value> -end-volume - -volume wb8 - type performance/write-behind - subvolumes iot8 -# option <key> <value> -end-volume - -volume ra8 - type performance/read-ahead - subvolumes wb8 -# option <key> <value> -end-volume - -volume server8 - type protocol/server - subvolumes ra8 ra1 ra2 ra3 ra4 ra5 ra6 ra7 brick-ns - option transport-type tcp # For TCP/IP transport -# option transport-type ib-sdp # For Infiniband transport -# option transport-type ib-verbs # For ib-verbs transport - option client-volume-filename /examples/qa-client.vol - option auth.addr.ra1.allow * # Allow access to "stat8" volume - option auth.addr.ra2.allow * # Allow access to "stat8" volume - option auth.addr.ra3.allow * # Allow access to "stat8" volume - option auth.addr.ra4.allow * # Allow access to "stat8" volume - option auth.addr.ra5.allow * # Allow access to "stat8" volume - option auth.addr.ra6.allow * # Allow access to "stat8" volume - option auth.addr.ra7.allow * # Allow access to "stat8" volume - option auth.addr.ra8.allow * # Allow access to "stat8" volume - option auth.addr.brick-ns.allow * # Allow access to "stat8" volume -end-volume - diff --git a/doc/split-brain.md b/doc/split-brain.md new file 
mode 100644 index 000000000..b0d938e26 --- /dev/null +++ b/doc/split-brain.md @@ -0,0 +1,251 @@ +Steps to recover from File split-brain. +====================================== + +Quick Start: +============ +1. Get the path of the file that is in split-brain: +> It can be obtained either by +> a) The command `gluster volume heal info split-brain`. +> b) Identify the files for which file operations performed + from the client keep failing with Input/Output error. + +2. Close the applications that opened this file from the mount point. +In case of VMs, they need to be powered-off. + +3. Decide on the correct copy: +> This is done by observing the afr changelog extended attributes of the file on +the bricks using the getfattr command; then identifying the type of split-brain +(data split-brain, metadata split-brain, entry split-brain or split-brain due to +gfid-mismatch); and finally determining which of the bricks contains the 'good copy' +of the file. +> `getfattr -d -m . -e hex <file-path-on-brick>`. +It is also possible that one brick might contain the correct data while the +other might contain the correct metadata. + +4. Reset the relevant extended attribute on the brick(s) that contains the +'bad copy' of the file data/metadata using the setfattr command. +> `setfattr -n <attribute-name> -v <attribute-value> <file-path-on-brick>` + +5. Trigger self-heal on the file by performing lookup from the client: +> `ls -l <file-path-on-gluster-mount>` + +Detailed Instructions for steps 3 through 5: +=========================================== +To understand how to resolve split-brain we need to know how to interpret the +afr changelog extended attributes. + +Execute `getfattr -d -m . -e hex <file-path-on-brick>` + +* Example: +[root@store3 ~]# getfattr -d -e hex -m. 
brick-a/file.txt +\#file: brick-a/file.txt +security.selinux=0x726f6f743a6f626a6563745f723a66696c655f743a733000 +trusted.afr.vol-client-2=0x000000000000000000000000 +trusted.afr.vol-client-3=0x000000000200000000000000 +trusted.gfid=0x307a5c9efddd4e7c96e94fd4bcdcbd1b + +The extended attributes with `trusted.afr.<volname>-client-<subvolume-index>` +are used by afr to maintain changelog of the file.The values of the +`trusted.afr.<volname>-client-<subvolume-index>` are calculated by the glusterfs +client (fuse or nfs-server) processes. When the glusterfs client modifies a file +or directory, the client contacts each brick and updates the changelog extended +attribute according to the response of the brick. + +'subvolume-index' is nothing but (brick number - 1) in +`gluster volume info <volname>` output. + +* Example: +[root@pranithk-laptop ~]# gluster volume info vol + Volume Name: vol + Type: Distributed-Replicate + Volume ID: 4f2d7849-fbd6-40a2-b346-d13420978a01 + Status: Created + Number of Bricks: 4 x 2 = 8 + Transport-type: tcp + Bricks: + brick-a: pranithk-laptop:/gfs/brick-a + brick-b: pranithk-laptop:/gfs/brick-b + brick-c: pranithk-laptop:/gfs/brick-c + brick-d: pranithk-laptop:/gfs/brick-d + brick-e: pranithk-laptop:/gfs/brick-e + brick-f: pranithk-laptop:/gfs/brick-f + brick-g: pranithk-laptop:/gfs/brick-g + brick-h: pranithk-laptop:/gfs/brick-h + +In the example above: +``` +Brick | Replica set | Brick subvolume index +---------------------------------------------------------------------------- +-/gfs/brick-a | 0 | 0 +-/gfs/brick-b | 0 | 1 +-/gfs/brick-c | 1 | 2 +-/gfs/brick-d | 1 | 3 +-/gfs/brick-e | 2 | 4 +-/gfs/brick-f | 2 | 5 +-/gfs/brick-g | 3 | 6 +-/gfs/brick-h | 3 | 7 +``` + +Each file in a brick maintains the changelog of itself and that of the files +present in all the other bricks in it's replica set as seen by that brick. 
+ +In the example volume given above, all files in brick-a will have 2 entries, +one for itself and the other for the file present in it's replica pair, i.e.brick-b: +trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for itself (brick-a) +trusted.afr.vol-client-1=0x000000000000000000000000 -->changelog for brick-b as seen by brick-a + +Likewise, all files in brick-b will have: +trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for brick-a as seen by brick-b +trusted.afr.vol-client-1=0x000000000000000000000000 -->changelog for itself (brick-b) + +The same can be extended for other replica pairs. + +Interpreting Changelog (roughly pending operation count) Value: +Each extended attribute has a value which is 24 hexa decimal digits. +First 8 digits represent changelog of data. Second 8 digits represent changelog +of metadata. Last 8 digits represent Changelog of directory entries. + +Pictorially representing the same, we have: +``` +0x 000003d7 00000001 00000000 + | | | + | | \_ changelog of directory entries + | \_ changelog of metadata + \ _ changelog of data +``` + + +For Directories metadata and entry changelogs are valid. +For regular files data and metadata changelogs are valid. +For special files like device files etc metadata changelog is valid. +When a file split-brain happens it could be either data split-brain or +meta-data split-brain or both. When a split-brain happens the changelog of the +file would be something like this: + +* Example:(Lets consider both data, metadata split-brain on same file). +[root@pranithk-laptop vol]# getfattr -d -m . 
-e hex /gfs/brick-?/a +getfattr: Removing leading '/' from absolute path names +\#file: gfs/brick-a/a +trusted.afr.vol-client-0=0x000000000000000000000000 +trusted.afr.vol-client-1=0x000003d70000000100000000 +trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57 +\#file: gfs/brick-b/a +trusted.afr.vol-client-0=0x000003b00000000100000000 +trusted.afr.vol-client-1=0x000000000000000000000000 +trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57 + +###Observations: + +####According to changelog extended attributes on file /gfs/brick-a/a: +The first 8 digits of trusted.afr.vol-client-0 are all +zeros (0x00000000................), and the first 8 digits of +trusted.afr.vol-client-1 are not all zeros (0x000003d7................). +So the changelog on /gfs/brick-a/a implies that some data operations succeeded +on itself but failed on /gfs/brick-b/a. + +The second 8 digits of trusted.afr.vol-client-0 are +all zeros (0x........00000000........), and the second 8 digits of +trusted.afr.vol-client-1 are not all zeros (0x........00000001........). +So the changelog on /gfs/brick-a/a implies that some metadata operations succeeded +on itself but failed on /gfs/brick-b/a. + +####According to Changelog extended attributes on file /gfs/brick-b/a: +The first 8 digits of trusted.afr.vol-client-0 are not all +zeros (0x000003b0................), and the first 8 digits of +trusted.afr.vol-client-1 are all zeros (0x00000000................). +So the changelog on /gfs/brick-b/a implies that some data operations succeeded +on itself but failed on /gfs/brick-a/a. + +The second 8 digits of trusted.afr.vol-client-0 are not +all zeros (0x........00000001........), and the second 8 digits of +trusted.afr.vol-client-1 are all zeros (0x........00000000........). +So the changelog on /gfs/brick-b/a implies that some metadata operations succeeded +on itself but failed on /gfs/brick-a/a. 
+
+Since both copies have data and metadata changes that are not on the other
+file, the file is in both data and metadata split-brain.
+
+Deciding on the correct copy:
+-----------------------------
+The user may have to inspect stat, getfattr output of the files to decide which
+metadata to retain and contents of the file to decide which data to retain.
+Continuing with the example above, let's say we want to retain the data
+of /gfs/brick-a/a and metadata of /gfs/brick-b/a.
+
+Resetting the relevant changelogs to resolve the split-brain:
+-------------------------------------------------------------
+For resolving data-split-brain:
+We need to change the changelog extended attributes on the files as if some data
+operations succeeded on /gfs/brick-a/a but failed on /gfs/brick-b/a. But
+/gfs/brick-b/a should NOT have any changelog which says some data operations
+succeeded on /gfs/brick-b/a but failed on /gfs/brick-a/a. We need to reset the
+data part of the changelog on trusted.afr.vol-client-0 of /gfs/brick-b/a.
+
+For resolving metadata-split-brain:
+We need to change the changelog extended attributes on the files as if some
+metadata operations succeeded on /gfs/brick-b/a but failed on /gfs/brick-a/a.
+But /gfs/brick-a/a should NOT have any changelog which says some metadata
+operations succeeded on /gfs/brick-a/a but failed on /gfs/brick-b/a. 
+We need to reset the metadata part of the changelog on
+trusted.afr.vol-client-1 of /gfs/brick-a/a.
+
+So, the intended changes are:
+On /gfs/brick-b/a:
+For trusted.afr.vol-client-0
+0x000003b00000000100000000 to 0x000000000000000100000000
+(Note that the metadata part is still not all zeros)
+Hence execute
+`setfattr -n trusted.afr.vol-client-0 -v 0x000000000000000100000000 /gfs/brick-b/a`
+
+On /gfs/brick-a/a:
+For trusted.afr.vol-client-1
+0x000003d70000000100000000 to 0x000003d70000000000000000
+(Note that the data part is still not all zeros)
+Hence execute
+`setfattr -n trusted.afr.vol-client-1 -v 0x000003d70000000000000000 /gfs/brick-a/a`
+
+Thus after the above operations are done, the changelogs look like this:
+[root@pranithk-laptop vol]# getfattr -d -m . -e hex /gfs/brick-?/a
+getfattr: Removing leading '/' from absolute path names
+\#file: gfs/brick-a/a
+trusted.afr.vol-client-0=0x000000000000000000000000
+trusted.afr.vol-client-1=0x000003d70000000000000000
+trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57
+
+\#file: gfs/brick-b/a
+trusted.afr.vol-client-0=0x000000000000000100000000
+trusted.afr.vol-client-1=0x000000000000000000000000
+trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57
+
+
+Triggering Self-heal:
+---------------------
+Perform `ls -l <file-path-on-gluster-mount>` to trigger healing.
+
+Fixing Directory entry split-brain:
+----------------------------------
+Afr has the ability to conservatively merge different entries in the directories
+when there is a split-brain on directory.
+If on one brick directory 'd' has entries '1', '2' and on the other brick it has
+entries '3', '4', then afr will merge all of the entries in the directory to have
+'1', '2', '3', '4' entries in the same directory. 
+(Note: this may result in deleted files re-appearing in case the split-brain
+happens because of deletion of files in the directory)
+Split-brain resolution needs human intervention when there is at least one entry
+which has the same file name but a different gfid in that directory.
+Example:
+On brick-a the directory has entries '1' (with gfid g1), '2' and on brick-b
+directory has entries '1' (with gfid g2) and '3'.
+These kinds of directory split-brains need human intervention to resolve.
+The user needs to remove either file '1' on brick-a or the file '1' on brick-b
+to resolve the split-brain. In addition, the corresponding gfid-link file also
+needs to be removed. The gfid-link files are present in the .glusterfs folder
+in the top-level directory of the brick. If the gfid of the file is
+0x307a5c9efddd4e7c96e94fd4bcdcbd1b (the trusted.gfid extended attribute obtained
+from the getfattr command earlier), the gfid-link file can be found at
+> /gfs/brick-a/.glusterfs/30/7a/307a5c9efddd4e7c96e94fd4bcdcbd1b
+
+####Word of caution:
+Before deleting the gfid-link, we have to ensure that there are no hard links
+to the file present on that brick. If hard-links exist, they must be deleted as
+well. |
