changes to NSR reconciliation code to add error handling.

Description of changes added: 1) In the recon driver, check the return values of all glfs calls. 2) Make the driver send error values back to other drivers or to the main translator. 3) Let the leader retry on errors. Change-Id: I050003a819d2314c8fdfd111df465041c30ee6e3 Signed-off-by: Raghavan P <rpichai@redhat.com>
author: Raghavan P <rpichai@redhat.com> 2014-02-19 07:03:26 +0530
committer: Jeff Darcy <jdarcy@redhat.com> 2014-03-03 19:41:32 +0000
commit: c28972ea53cc7cdb91c7aac01754dd7f0b66e1a7 (patch)
tree: fc316e94c6494b282a1179bb97939909e5cbcba0 /xlators/cluster/nsr-server/src
parent: 3bbfebc8dc21c469d47b576069ae137aec4567c9 (diff)
1 files changed, 40 insertions, 3 deletions
diff --git a/xlators/cluster/nsr-server/src/recon_notify.c b/xlators/cluster/nsr-server/src/recon_notify.c
index 7a0de85b1..7397192ae 100644
--- a/xlators/cluster/nsr-server/src/recon_notify.c
+++ b/xlators/cluster/nsr-server/src/recon_notify.c
@@ -120,12 +120,49 @@ nsr_recon_set_leader (xlator_t *this)
         // in the callback (once reconciliation is done),
         // we will unfence the IOs.
         // TBD - error handling later.
-        glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET);
+        if (glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET) == -1) {
+		gf_log (this->name, GF_LOG_ERROR,
+                        "doing lseek failed\n");
+		return;
+	}
+
         glusterfs_this_set(old);
         gf_log (this->name, GF_LOG_INFO,
                 "Writing to local node to set leader");
-        glfs_write(ctx->fd, &role,
-                   sizeof(role), 0);
+	do {
+		 if (priv->leader != _gf_true) {
+			glusterfs_this_set(old);
+			gf_log (this->name, GF_LOG_ERROR, "no longer leader\n");
+			return;
+		 }
+		 if (glfs_write(ctx->fd, &role, sizeof(role), 0) == -1) {
+			if (errno == EAGAIN) {
+				// Wait for old reconciliation to bail out.
+				glusterfs_this_set(old);
+				gf_log (this->name, GF_LOG_ERROR,
+                        		"write failed with retry. retrying after some time\n");
+				sleep(5);
+				continue;
+			}
+			else{
+				glusterfs_this_set(old);
+				gf_log (this->name, GF_LOG_ERROR,
+                        		"doing write failed\n");
+				// This is because reconciliation has returned with error
+				// because some node has died in between.
+				// What should be done? Either we retry being leader
+				// or hook to CHILD_DOWN notification.
+				// Put that logic later. As of now we will just retry.
+				// This is easier.
+				sleep(5);
+				continue;
+			}
+		} else {
+			glusterfs_this_set(old);
+			gf_log (this->name, GF_LOG_INFO, "doing write with success\n");
+			break;
+		}
+	} while(1);
         glusterfs_this_set(old);
 	gf_log (this->name, GF_LOG_INFO,
                 "glfs_write returned. unfencing IO\n");
author	Raghavan P <rpichai@redhat.com>	2014-02-19 07:03:26 +0530
committer	Jeff Darcy <jdarcy@redhat.com>	2014-03-03 19:41:32 +0000
commit	c28972ea53cc7cdb91c7aac01754dd7f0b66e1a7 (patch)
tree	fc316e94c6494b282a1179bb97939909e5cbcba0 /xlators/cluster/nsr-server/src
parent	3bbfebc8dc21c469d47b576069ae137aec4567c9 (diff)