From 593150979399f7f11e580591eab4b032bb0228ac Mon Sep 17 00:00:00 2001
From: Ravishankar N
Date: Wed, 16 Oct 2019 13:06:29 +0530
Subject: afr: lock healing changes

Implements lock healing for the gluster-block fencing use case. If
mandatory lock is enabled:
- Add domain lock/unlock to afr_lk fop.
- Maintain a list of locks to be healed in afr_private_t.
- Add lock to the list if afr_lk(F_SETLK or F_SETLKW) was successful.
- Remove it from the list during afr_lk(F_UNLCK).
- On child_down, mark lock as needing heal on that child. If lock is
  lost on quorum no. of bricks, remove it from the list and mark fd bad.
- For fds marked as bad, fail the subsequent fd based fops.
- On parent up, traverse the list and heal the locks IFF the client is
  the lk owner and has quorum. (shd does not heal any locks).

updates: #613
Change-Id: I03c46ceaea30f5e6236d5ec13f71d843d827f1bc
Signed-off-by: Ravishankar N
---
 tests/basic/fencing/afr-lock-heal-advanced.c | 227 ++++++++
 tests/basic/fencing/afr-lock-heal-advanced.t | 104 ++++
 tests/basic/fencing/afr-lock-heal-basic.c    | 182 ++++++
 tests/basic/fencing/afr-lock-heal-basic.t    |  99 ++++
 xlators/cluster/afr/src/afr-common.c         | 818 +++++++++++++++++++++++++--
 xlators/cluster/afr/src/afr-inode-read.c     |   4 +
 xlators/cluster/afr/src/afr-inode-write.c    |  10 +
 xlators/cluster/afr/src/afr-mem-types.h      |   2 +
 xlators/cluster/afr/src/afr-messages.h       |   2 +-
 xlators/cluster/afr/src/afr.c                |   3 +
 xlators/cluster/afr/src/afr.h                |  46 ++
 11 files changed, 1461 insertions(+), 36 deletions(-)
 create mode 100644 tests/basic/fencing/afr-lock-heal-advanced.c
 create mode 100644 tests/basic/fencing/afr-lock-heal-advanced.t
 create mode 100644 tests/basic/fencing/afr-lock-heal-basic.c
 create mode 100644 tests/basic/fencing/afr-lock-heal-basic.t

diff --git a/tests/basic/fencing/afr-lock-heal-advanced.c b/tests/basic/fencing/afr-lock-heal-advanced.c
new file mode 100644
index 00000000000..e202ccd5b29
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-advanced.c
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <glusterfs/api/glfs.h>
+
+#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
+
+FILE *logfile_fp;
+
+#define LOG_ERR(func, err)                                                   \
+    do {                                                                     \
+        if (!logfile_fp) {                                                   \
+            fprintf(stderr, "\n%d %s : returned error (%s)\n", __LINE__,     \
+                    func, strerror(err));                                    \
+            fflush(stderr);                                                  \
+        } else {                                                             \
+            fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__, \
+                    func, strerror(err));                                    \
+            fflush(logfile_fp);                                              \
+        }                                                                    \
+    } while (0)
+
+glfs_t *
+setup_client(char *hostname, char *volname, char *log_file)
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+
+    fs = glfs_new(volname);
+    if (!fs) {
+        fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n",
+                strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_logging(fs, log_file, 7);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_init(fs);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret,
+                strerror(errno));
+        goto error;
+    }
+
+out:
+    return fs;
+error:
+    return NULL;
+}
+
+glfs_fd_t *
+open_file(glfs_t *fs, char *fname)
+{
+    glfs_fd_t *fd = NULL;
+
+    fd = glfs_creat(fs, fname, O_CREAT, 0644);
+    if (!fd) {
+        LOG_ERR("glfs_creat", errno);
+        goto out;
+    }
+out:
+    return fd;
+}
+
+int
+acquire_mandatory_lock(glfs_t *fs, glfs_fd_t *fd)
+{
+    struct flock lock;
+    int ret = 0;
+
+    /* initialize lock */
+    lock.l_type = F_WRLCK;
+    lock.l_whence = SEEK_SET;
+    lock.l_start = 0;
+    lock.l_len = 100;
+
+    ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0);
+    if (ret < 0) {
+        LOG_ERR("glfs_fsetxattr", errno);
+        ret = -1;
+        goto out;
+    }
+
+    /* take a write mandatory lock */
+    ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY);
+    if (ret) {
+        LOG_ERR("glfs_file_lock", errno);
+        ret = -1;
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+int
+perform_test(glfs_t *fs, char *file1, char *file2)
+{
+    int ret = 0;
+    glfs_fd_t *fd1 = NULL;
+    glfs_fd_t *fd2 = NULL;
+    char *buf = "0123456789";
+
+    fd1 = open_file(fs, file1);
+    if (!fd1) {
+        ret = -1;
+        goto out;
+    }
+    fd2 = open_file(fs, file2);
+    if (!fd2) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Kill one brick from the .t.*/
+    pause();
+
+    ret = acquire_mandatory_lock(fs, fd1);
+    if (ret) {
+        goto out;
+    }
+    ret = acquire_mandatory_lock(fs, fd2);
+    if (ret) {
+        goto out;
+    }
+
+    /* Bring the brick up and let the locks heal. */
+    pause();
+    /*At this point, the .t would have killed and brought back 2 bricks,
+     * marking the fd bad.*/
+
+    ret = glfs_write(fd1, buf, 10, 0);
+    if (ret > 0) {
+        /* Write is supposed to fail with EBADFD*/
+        LOG_ERR("glfs_write", ret);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (fd1)
+        glfs_close(fd1);
+    if (fd2)
+        glfs_close(fd2);
+    return ret;
+}
+
+static void
+sigusr1_handler(int signo)
+{
+    /*Signal caught. Just continue with the execution.*/
+}
+
+int
+main(int argc, char *argv[])
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+    char *volname = NULL;
+    char log_file[100];
+    char *hostname = NULL;
+    char *fname1 = NULL;
+    char *fname2 = NULL;
+
+    if (argc != 7) {
+        fprintf(stderr,
+                "Expect following args %s <host> <volname> <file1> <file2> "
+                "<log-dir> <log-file-suffix>\n",
+                argv[0]);
+        return -1;
+    }
+
+    hostname = argv[1];
+    volname = argv[2];
+    fname1 = argv[3];
+    fname2 = argv[4];
+
+    /*Use SIGUSR1 and pause() as a means of hitting break-points in this
+     *program when signalled from the .t test case.*/
+    if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) {
+        LOG_ERR("SIGUSR1 handler error", errno);
+        exit(EXIT_FAILURE);
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[5], "lock-heal.c", argv[6], "log");
+    logfile_fp = fopen(log_file, "w");
+    if (!logfile_fp) {
+        fprintf(stderr, "\nfailed to open %s\n", log_file);
+        fflush(stderr);
+        return -1;
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[5], "glfs-client", argv[6], "log");
+    fs = setup_client(hostname, volname, log_file);
+    if (!fs) {
+        LOG_ERR("setup_client", errno);
+        return -1;
+    }
+
+    ret = perform_test(fs, fname1, fname2);
+
+error:
+    if (fs) {
+        /*glfs_fini(fs)*/; // glfs fini path is racy and crashes the program
+    }
+
+    fclose(logfile_fp);
+
+    return ret;
+}
diff --git a/tests/basic/fencing/afr-lock-heal-advanced.t b/tests/basic/fencing/afr-lock-heal-advanced.t
new file mode 100644
index 00000000000..8a7a208db29
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-advanced.t
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+function is_gfapi_program_alive()
+{
+    pid=$1
+    ps -p $pid
+    if [ $?
-eq 0 ] + then + echo "Y" + else + echo "N" + fi +} + +function get_active_lock_count { + brick=$1 + sdump=$(generate_brick_statedump $V0 $H0 $brick) + lock_count="$(grep ACTIVE $sdump| wc -l)" + echo "$lock_count" +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +EXPECT 'Created' volinfo_field $V0 'Status'; +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.open-behind off +TEST $CLI volume set $V0 locks.mandatory-locking forced +TEST $CLI volume set $V0 enforce-mandatory-lock on +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +logdir=`gluster --print-logdir` +TEST build_tester $(dirname $0)/afr-lock-heal-advanced.c -lgfapi -ggdb + +#------------------------------------------------------------------------------ +# Use more than 1 fd from same client so that list_for_each_* loops are executed more than once. +$(dirname $0)/afr-lock-heal-advanced $H0 $V0 "/FILE1" "/FILE2" $logdir C1& +client_pid=$! +TEST [ $client_pid ] + +TEST sleep 5 # By now, the client would have opened an fd on FILE1 and FILE2 and waiting for a SIGUSR1. +EXPECT "Y" is_gfapi_program_alive $client_pid + +# Kill brick-3 and let client-1 take lock on both files. +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST kill -SIGUSR1 $client_pid +# If program is still alive, glfs_file_lock() was a success. +EXPECT "Y" is_gfapi_program_alive $client_pid + +# Check lock is present on brick-1 and brick-2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}1 + +# Restart brick-3 and check that the lock has healed on it. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}2 + +#------------------------------------------------------------------------------ +# Kill same brick before heal completes the first time and check it completes the second time. +TEST $CLI volume set $V0 delay-gen locks +TEST $CLI volume set $V0 delay-gen.delay-duration 5000000 +TEST $CLI volume set $V0 delay-gen.delay-percentage 100 +TEST $CLI volume set $V0 delay-gen.enable finodelk + +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST $CLI volume reset $V0 delay-gen +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0 + +#------------------------------------------------------------------------------ +# Kill 2 bricks and bring it back. The fds must be marked bad. +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 + +# TODO: `gluster v statedump $V0 client localhost:$client_pid` is not working, +# so sleep for 20 seconds for the client to connect to connect to the bricks. +TEST sleep $CHILD_UP_TIMEOUT + +# Try to write to FILE1 from the .c; it must fail. +TEST kill -SIGUSR1 $client_pid +wait $client_pid +ret=$? 
+TEST [ $ret == 0 ]
+
+cleanup_tester $(dirname $0)/afr-lock-heal-advanced
+cleanup;
diff --git a/tests/basic/fencing/afr-lock-heal-basic.c b/tests/basic/fencing/afr-lock-heal-basic.c
new file mode 100644
index 00000000000..768c9e57181
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-basic.c
@@ -0,0 +1,182 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <glusterfs/api/glfs.h>
+
+#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
+
+FILE *logfile_fp;
+
+#define LOG_ERR(func, err)                                                   \
+    do {                                                                     \
+        if (!logfile_fp) {                                                   \
+            fprintf(stderr, "\n%d %s : returned error (%s)\n", __LINE__,     \
+                    func, strerror(err));                                    \
+            fflush(stderr);                                                  \
+        } else {                                                             \
+            fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__, \
+                    func, strerror(err));                                    \
+            fflush(logfile_fp);                                              \
+        }                                                                    \
+    } while (0)
+
+glfs_t *
+setup_client(char *hostname, char *volname, char *log_file)
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+
+    fs = glfs_new(volname);
+    if (!fs) {
+        fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n",
+                strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_logging(fs, log_file, 7);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_init(fs);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret,
+                strerror(errno));
+        goto error;
+    }
+
+out:
+    return fs;
+error:
+    return NULL;
+}
+
+int
+acquire_mandatory_lock(glfs_t *fs, char *fname)
+{
+    struct flock lock;
+    int ret = 0;
+    glfs_fd_t *fd = NULL;
+
+    fd = glfs_creat(fs, fname, O_CREAT, 0644);
+    if (!fd) {
+        if (errno != EEXIST) {
+            LOG_ERR("glfs_creat", errno);
+            ret = -1;
+            goto out;
+        }
+        fd = glfs_open(fs, fname, O_RDWR | O_NONBLOCK);
+        if (!fd) {
+            LOG_ERR("glfs_open", errno);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* initialize lock */
+    lock.l_type = F_WRLCK;
+    lock.l_whence = SEEK_SET;
+    lock.l_start = 0;
+    lock.l_len = 100;
+
+    ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0);
+    if (ret < 0) {
+        LOG_ERR("glfs_fsetxattr", errno);
+        ret = -1;
+        goto out;
+    }
+
+    pause();
+
+    /* take a write mandatory lock */
+    ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY);
+    if (ret) {
+        LOG_ERR("glfs_file_lock", errno);
+        goto out;
+    }
+
+    pause();
+
+out:
+    if (fd) {
+        glfs_close(fd);
+    }
+
+    return ret;
+}
+
+static void
+sigusr1_handler(int signo)
+{
+    /*Signal caught. Just continue with the execution.*/
+}
+
+int
+main(int argc, char *argv[])
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+    char *volname = NULL;
+    char log_file[100];
+    char *hostname = NULL;
+    char *fname = NULL;
+
+    if (argc != 6) {
+        fprintf(stderr,
+                "Expect following args %s <host> <volname> <file> <log-dir> "
+                "<log-file-suffix>\n",
+                argv[0]);
+        return -1;
+    }
+
+    hostname = argv[1];
+    volname = argv[2];
+    fname = argv[3];
+
+    /*Use SIGUSR1 and pause() as a means of hitting break-points in this
+     *program when signalled from the .t test case.*/
+    if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) {
+        LOG_ERR("SIGUSR1 handler error", errno);
+        exit(EXIT_FAILURE);
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[4], "lock-heal-basic.c", argv[5],
+            "log");
+    logfile_fp = fopen(log_file, "w");
+    if (!logfile_fp) {
+        fprintf(stderr, "\nfailed to open %s\n", log_file);
+        fflush(stderr);
+        return -1;
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[4], "glfs-client", argv[5], "log");
+    fs = setup_client(hostname, volname, log_file);
+    if (!fs) {
+        LOG_ERR("setup_client", errno);
+        return -1;
+    }
+
+    ret = acquire_mandatory_lock(fs, fname);
+
+error:
+    if (fs) {
+        /*glfs_fini(fs)*/; // glfs fini path is racy and crashes the program
+    }
+
+    fclose(logfile_fp);
+
+    return ret;
+}
diff --git a/tests/basic/fencing/afr-lock-heal-basic.t b/tests/basic/fencing/afr-lock-heal-basic.t
new file mode 100644
index 00000000000..5ac05c7aec6
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-basic.t
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+function is_gfapi_program_alive()
+{
+    pid=$1
+    ps -p $pid
+    if [ $? -eq 0 ]
+    then
+        echo "Y"
+    else
+        echo "N"
+    fi
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+EXPECT 'Created' volinfo_field $V0 'Status';
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.open-behind off
+TEST $CLI volume set $V0 locks.mandatory-locking forced
+TEST $CLI volume set $V0 enforce-mandatory-lock on
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+logdir=`gluster --print-logdir`
+TEST build_tester $(dirname $0)/afr-lock-heal-basic.c -lgfapi -ggdb
+
+$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C1&
+client1_pid=$!
+TEST [ $client1_pid ]
+
+$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C2&
+client2_pid=$!
+TEST [ $client2_pid ]
+
+TEST sleep 5 # By now, the 2 clients would have opened an fd on FILE and waiting for a SIGUSR1.
+EXPECT "Y" is_gfapi_program_alive $client1_pid
+EXPECT "Y" is_gfapi_program_alive $client2_pid
+
+# Kill brick-3 and let client-1 take lock on the file.
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST kill -SIGUSR1 $client1_pid
+# If program is still alive, glfs_file_lock() was a success.
+EXPECT "Y" is_gfapi_program_alive $client1_pid
+
+# Check lock is present on brick-1 and brick-2
+b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0)
+b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1)
+c1_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+c1_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b2" ]
+
+# Restart brick-3 and check that the lock has healed on it.
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+TEST sleep 10 #Needed for client to re-open fd?
Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal. + +b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2) +c1_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b3" ] + +# Kill brick-1 and let client-2 preempt the lock on bricks 2 and 3. +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill -SIGUSR1 $client2_pid +# If program is still alive, glfs_file_lock() was a success. +EXPECT "Y" is_gfapi_program_alive $client2_pid + +# Restart brick-1 and let lock healing complete. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal. + +# Check that all bricks now have locks from client 2 only. +b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0) +b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1) +b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2) +c2_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +c2_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +c2_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b2" ] +TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b3" ] +TEST [ "$c2_lock_on_b1" != "$c1_lock_on_b1" ] + +#Let the client programs run and exit. +TEST kill -SIGUSR1 $client1_pid +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client1_pid +TEST kill -SIGUSR1 $client2_pid +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client2_pid + +cleanup_tester $(dirname $0)/afr-lock-heal-basic +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 4b22af7cb3f..07bf53a1941 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,6 +45,21 @@ afr_quorum_errno(afr_private_t *priv) return ENOTCONN; } +static void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) +{ + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + replies[i] = 1; + } else { + replies[i] = 0; + } + } +} + int afr_fav_child_reset_sink_xattrs(void *opaque); @@ -54,6 +69,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque); static void afr_discover_done(call_frame_t *frame, xlator_t *this); +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + local->cont.lk.dom_lock_op_ret[i] = op_ret; + local->cont.lk.dom_lock_op_errno[i] = op_errno; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to acquire %s on %s", + uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, + priv->children[i]->name); + } else { + local->cont.lk.dom_locked_nodes[i] = 1; + } + + syncbarrier_wake(&local->barrier); + + return 0; +} + +int +afr_dom_lock_acquire(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + int i = 0; + + priv = frame->this->private; + local = frame->local; + local->cont.lk.dom_locked_nodes = GF_CALLOC( + priv->child_count, 
sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + if (!local->cont.lk.dom_locked_nodes) { + return -ENOMEM; + } + local->cont.lk.dom_lock_op_ret = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_ret) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + local->cont.lk.dom_lock_op_errno = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_errno) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + flock.l_type = F_WRLCK; + + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLK, &flock, NULL); + + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) + goto blocking_lock; + + /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ + if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != + priv->child_count) { + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.dom_lock_op_ret[i] == -1 && + local->cont.lk.dom_lock_op_errno[i] == EAGAIN) + goto blocking_lock; + } + } + + return 0; + +blocking_lock: + afr_dom_lock_release(frame); + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLKW, &flock, NULL); + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { + afr_dom_lock_release(frame); + return -afr_quorum_errno(priv); + } + + return 0; +} + +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to release %s on %s", local->loc.path, + AFR_LK_HEAL_DOM, priv->children[i]->name); + } + local->cont.lk.dom_locked_nodes[i] = 0; + + syncbarrier_wake(&local->barrier); + + return 0; +} + +void +afr_dom_lock_release(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + priv = frame->this->private; + locked_on = local->cont.lk.dom_locked_nodes; + if (AFR_COUNT(locked_on, priv->child_count) == 0) + return; + flock.l_type = F_UNLCK; + + AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, + AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); + + return; +} + +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) +{ + if (!info) + return; + if (info->xdata_req) + dict_unref(info->xdata_req); + if (info->fd) + fd_unref(info->fd); + GF_FREE(info->locked_nodes); + GF_FREE(info->child_up_event_gen); + GF_FREE(info->child_down_event_gen); + GF_FREE(info); +} + +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -ENOMEM; + + info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); + if (!info) { + goto cleanup; + } + INIT_LIST_HEAD(&info->pos); + info->fd = fd_ref(local->fd); + info->cmd = local->cont.lk.cmd; + info->pid = frame->root->pid; + info->flock = local->cont.lk.user_flock; + info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); + if (!info->xdata_req) { + goto cleanup; + } + info->lk_owner = frame->root->lk_owner; + 
info->locked_nodes = GF_MALLOC( + sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); + if (!info->locked_nodes) { + goto cleanup; + } + memcpy(info->locked_nodes, local->cont.lk.locked_nodes, + sizeof(*info->locked_nodes) * priv->child_count); + info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), + priv->child_count, gf_afr_mt_int32_t); + if (!info->child_up_event_gen) { + goto cleanup; + } + info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), + priv->child_count, + gf_afr_mt_int32_t); + if (!info->child_down_event_gen) { + goto cleanup; + } + + LOCK(&local->fd->lock); + { + fd_ctx = __afr_fd_ctx_get(local->fd, this); + if (fd_ctx) + fd_ctx->lk_heal_info = info; + } + UNLOCK(&local->fd->lock); + if (!fd_ctx) { + goto cleanup; + } + + LOCK(&priv->lock); + { + list_add_tail(&info->pos, &priv->saved_locks); + } + UNLOCK(&priv->lock); + + return 0; +cleanup: + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to add lock to healq", + uuid_utoa(local->fd->inode->gfid)); + if (info) { + afr_lk_heal_info_cleanup(info); + if (fd_ctx) { + LOCK(&local->fd->lock); + { + fd_ctx->lk_heal_info = NULL; + } + UNLOCK(&local->fd->lock); + } + } + return ret; +} + +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = local->cont.lk.user_flock; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -EINVAL; + + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx || !fd_ctx->lk_heal_info) { + goto out; + } + + info = fd_ctx->lk_heal_info; + if ((info->flock.l_start != flock.l_start) || + (info->flock.l_whence != flock.l_whence) || + (info->flock.l_len != flock.l_len)) { + /*TODO: Compare lkowners too.*/ + goto out; + } + + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + UNLOCK(&priv->lock); + + afr_lk_heal_info_cleanup(info); + fd_ctx->lk_heal_info = NULL; + ret = 0; +out: + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to remove lock from healq", + uuid_utoa(local->fd->inode->gfid)); + return ret; +} + +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed to heal lock on child %d for %s", i, + uuid_utoa(local->fd->inode->gfid)); + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); + } else { + local->cont.lk.getlk_rsp[i] = *lock; + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + afr_local_t *local = frame->local; + struct gf_flock flock = { + 0, + }; + gf_boolean_t ret = _gf_true; 
+ char *wind_on = alloca0(priv->child_count); + unsigned char *success_replies = alloca0(priv->child_count); + local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), + priv->child_count, gf_afr_mt_gf_lock); + + flock = info->flock; + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + wind_on[i] = 1; + } + + AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, + info->xdata_req); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) == 0) { + ret = _gf_false; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret != 0) + continue; + if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) + continue; + /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/ + if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, + &info->lk_owner)) { + ret = _gf_false; + break; + } + } +out: + afr_local_replies_wipe(local, priv); + GF_FREE(local->cont.lk.getlk_rsp); + local->cont.lk.getlk_rsp = NULL; + return ret; +} + +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) + return; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + if (fd_ctx) { + fd_ctx->is_fd_bad = _gf_true; + fd_ctx->lk_heal_info = NULL; + } + } + UNLOCK(&fd->lock); +} + +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) +{ + LOCK(&priv->lock); + { + list_del(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + UNLOCK(&priv->lock); +} + +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + int op_errno = 0; + int32_t *current_event_gen = NULL; + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + char *wind_on = alloca0(priv->child_count); + gf_boolean_t retry = _gf_true; + + frame->root->pid = info->pid; + lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + + op_errno = -afr_dom_lock_acquire(frame); + if ((op_errno != 0)) { + goto release; + } + + if (!afr_does_lk_owner_match(frame, priv, info)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, + "Ignoring lock heal for %s since lk-onwers mismatch. " + "Lock possibly pre-empted by another client.", + uuid_utoa(info->fd->inode->gfid)); + goto release; + } + + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + continue; + wind_on[i] = 1; + } + + current_event_gen = alloca(priv->child_count); + memcpy(current_event_gen, info->child_up_event_gen, + priv->child_count * sizeof *current_event_gen); + AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, + &info->flock, info->xdata_req); + + LOCK(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { + continue; + } + + if ((current_event_gen[i] == info->child_up_event_gen[i]) && + (current_event_gen[i] > info->child_down_event_gen[i])) { + info->locked_nodes[i] = 1; + retry = _gf_false; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->saved_locks); + } else { + /*We received subsequent child up/down events while heal was in + * progress; don't mark child as healed. 
Attempt again on the + * new child up*/ + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, + "Event gen mismatch: skipped healing lock on child %d " + "for %s.", + i, uuid_utoa(info->fd->inode->gfid)); + } + } + } + UNLOCK(&priv->lock); + +release: + afr_dom_lock_release(frame); + if (retry) + afr_add_lock_to_lkhealq(priv, info); + return; +} + +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + STACK_DESTROY(frame->root); + return 0; +} + +static int +afr_lock_heal(void *opaque) +{ + call_frame_t *frame = (call_frame_t *)opaque; + call_frame_t *iter_frame = NULL; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + struct list_head healq = { + 0, + }; + int ret = 0; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + return ENOMEM; + } + + INIT_LIST_HEAD(&healq); + LOCK(&priv->lock); + { + list_splice_init(&priv->lk_healq, &healq); + } + UNLOCK(&priv->lock); + + list_for_each_entry_safe(info, tmp, &healq, pos) + { + GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < + priv->child_count)); + ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); + afr_lock_heal_do(iter_frame, priv, info); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = ENOTCONN; + gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, + AFR_MSG_LK_HEAL_DOM, + "Aborting processing of lk_healq." + "Healing will be reattempted on next child up for locks " + "that are still in quorum."); + LOCK(&priv->lock); + { + list_add_tail(&healq, &priv->lk_healq); + } + UNLOCK(&priv->lock); + break; + } + } + + AFR_STACK_DESTROY(iter_frame); + return ret; +} + +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_up_event_gen[child] = priv->event_generation; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) + return -1; + + ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, + frame); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, + "Failed to launch lock heal synctask"); + + return ret; +} + +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) +{ + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_down_event_gen[child] = priv->event_generation; + if (info->locked_nodes[child] == 1) + info->locked_nodes[child] = 0; + if (!afr_has_quorum(info->locked_nodes, this, NULL)) { + /* Since the lock was lost on quorum no. of nodes, we should + * not attempt to heal it anymore. Some other client could have + * acquired the lock, modified data and released it and this + * client wouldn't know about it if we heal it.*/ + afr_mark_fd_bad(info->fd, this); + list_del(&info->pos); + afr_lk_heal_info_cleanup(info); + /* We're not winding an unlock on the node where the lock is still + * present because when fencing logic switches over to the new + * client (since we marked the fd bad), it should preempt any + * existing lock. 
*/ + } + } + return 0; +} + gf_boolean_t afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) @@ -68,6 +658,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, return _gf_true; } +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) +{ + int ret = 0; + uint32_t lk_mode = GF_LK_ADVISORY; + + ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); + if (!ret && lk_mode == GF_LK_MANDATORY) + return _gf_true; + + return _gf_false; +} + call_frame_t * afr_copy_frame(call_frame_t *base) { @@ -1224,18 +1827,6 @@ refresh_done: return 0; } -static void -afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, - unsigned char *replies) -{ - int i = 0; - - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].valid && local->replies[i].op_ret == 0) - replies[i] = 1; - } -} - int afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) { @@ -2049,6 +2640,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this) { /* lk */ GF_FREE(local->cont.lk.locked_nodes); + GF_FREE(local->cont.lk.dom_locked_nodes); + GF_FREE(local->cont.lk.dom_lock_op_ret); + GF_FREE(local->cont.lk.dom_lock_op_errno); } { /* create */ @@ -3451,8 +4045,18 @@ out: } void -_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx) +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx) { + afr_private_t *priv = this->private; + + if (fd_ctx->lk_heal_info) { + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); + fd_ctx->lk_heal_info = NULL; + } GF_FREE(fd_ctx->opened_on); GF_FREE(fd_ctx); return; @@ -3472,7 +4076,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long)ctx; if (fd_ctx) { - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); } out: @@ -3565,13 +4169,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd) } fd_ctx->readdir_subvol = -1; + fd_ctx->lk_heal_info = NULL; ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx); if (ret) gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd); out: if (ret && fd_ctx) - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); return ret; } @@ -3694,6 +4299,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) call_stub_t *stub = NULL; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -4230,9 +4836,9 @@ out: } static int32_t -afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4244,8 +4850,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (flock->l_type != F_UNLCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.inodelk.volume = gf_strdup(volume); if (!local->cont.inodelk.volume) { @@ -4274,8 +4882,8 @@ int32_t afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, 
GF_FOP_INODELK, volume, loc, NULL, cmd, + flock, xdata); return 0; } @@ -4283,15 +4891,16 @@ int32_t afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, + flock, xdata); return 0; } static int -afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4303,8 +4912,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (cmd != ENTRYLK_UNLOCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.entrylk.cmd = cmd; local->cont.entrylk.in_cmd = cmd; local->cont.entrylk.type = type; @@ -4331,8 +4942,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, + cmd, type, xdata); return 0; } @@ -4341,8 +4952,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, + cmd, type, xdata); return 0; } @@ -4460,9 +5071,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } call_count = afr_frame_return(frame); - if (call_count == 0) + if (call_count == 0) { AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, local->xdata_rsp); + } return 0; } @@ -4560,12 +5172,134 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, return 0; } +int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} + +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = -1; + + local = frame->local; + child_index = (long)cookie; + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int child_index = (long)cookie; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + 
uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + return 0; +} +int +afr_lk_transaction(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *wind_on = NULL; + int op_errno = 0; + int i = 0; + int ret = 0; + + frame = (call_frame_t *)opaque; + local = frame->local; + this = frame->this; + priv = this->private; + wind_on = alloca0(priv->child_count); + + if (priv->arbiter_count || priv->child_count != 3) { + op_errno = ENOTSUP; + gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Lock healing supported only for replica 3 volumes.", + uuid_utoa(local->fd->inode->gfid)); + goto err; + } + + op_errno = -afr_dom_lock_acquire(frame); // Released during + // AFR_STACK_UNWIND + if (op_errno != 0) { + goto err; + } + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { + op_errno = afr_final_errno(local, priv); + goto err; + } + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) + wind_on[i] = 1; + } + AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + goto unlock; + } else { + if (local->cont.lk.user_flock.l_type == F_UNLCK) + ret = afr_remove_lock_from_saved_locks(local, this); + else + ret = afr_add_lock_to_saved_locks(frame, this); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; + } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } + + return 0; + +unlock: + local->cont.lk.user_flock.l_type = F_UNLCK; + AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return -1; +} + int afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + int ret = 0; int i = 0; int32_t op_errno = ENOMEM; @@ -4576,9 +5310,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, goto out; local->op = GF_FOP_LK; - if (!afr_lk_is_unlock(cmd, flock) && - !afr_is_consistent_io_possible(local, priv, &op_errno)) - goto out; + if (!afr_lk_is_unlock(cmd, flock)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + } local->cont.lk.locked_nodes = GF_CALLOC( priv->child_count, sizeof(*local->cont.lk.locked_nodes), @@ -4596,6 +5332,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, if (xdata) local->xdata_req = dict_ref(xdata); + if (afr_is_lock_mode_mandatory(xdata)) { + ret = synctask_new(this->ctx->env, afr_lk_transaction, + afr_lk_transaction_cbk, frame, frame); + if (ret) { + op_errno = ENOMEM; + goto out; + } + return 0; + } + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i], priv->children[i]->fops->lk, fd, cmd, flock, local->xdata_req); @@ -5593,6 +6339,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) __afr_handle_child_up_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + 
__afr_lock_heal_synctask(this, priv, idx); break; case GF_EVENT_CHILD_DOWN: @@ -5606,6 +6353,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) __afr_handle_child_down_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + __afr_mark_pending_lk_heal(this, priv, idx); break; case GF_EVENT_CHILD_CONNECTING: diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index e8894a62620..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -302,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) afr_local_t *local = NULL; int op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1698,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, int32_t op_errno = 0; fop_fgetxattr_cbk_t cbk = NULL; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1791,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1866,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 9acb4d0e053..a3d2150efe2 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -491,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int op_errno = ENOMEM; int ret = -1; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -730,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -940,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1690,6 +1693,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1898,6 +1902,7 @@ afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1998,6 +2003,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2107,6 +2113,7 @@ 
afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2213,6 +2220,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2412,6 +2420,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2507,6 +2516,7 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, int ret = -1; int32_t op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index b0fb00641a0..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -31,6 +31,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_empty_brick_t, gf_afr_mt_child_latency_t, gf_afr_mt_atomic_t, + gf_afr_mt_lk_heal_info_t, + gf_afr_mt_gf_lock, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index c9c99270e98..8e59c51b993 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -42,6 +42,6 @@ GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED, - AFR_MSG_THIN_ARB); + AFR_MSG_THIN_ARB, AFR_MSG_LK_HEAL_DOM); #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index f8db3c5653f..13b5ca2fce9 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -417,6 +417,8 @@ init(xlator_t *this) goto out; priv = this->private; + INIT_LIST_HEAD(&priv->saved_locks); + INIT_LIST_HEAD(&priv->lk_healq); LOCK_INIT(&priv->lock); child_count = xlator_subvolume_count(this); @@ -684,6 +686,7 @@ fini(xlator_t *this) priv = this->private; afr_selfheal_daemon_fini(this); + GF_ASSERT(list_empty(&priv->saved_locks)); LOCK(&priv->lock); if (priv->timer != NULL) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index f86f019e637..28be839ad68 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -39,6 +39,8 @@ #define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify" #define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" +#define AFR_LK_HEAL_DOM "afr.lock-heal.domain" + #define AFR_HALO_MAX_LATENCY 99999 #define PFLAG_PENDING (1 << 0) @@ -95,6 +97,16 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this); gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \ } while (0) +#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \ + do { \ + afr_fd_ctx_t *__fd_ctx = NULL; \ + __fd_ctx = afr_fd_ctx_get(__fd, __this); \ + if (__fd_ctx && __fd_ctx->is_fd_bad) { \ + __error = EBADF; \ + goto __label; \ + } \ + } while (0) + typedef enum { AFR_READ_POLICY_FIRST_UP, AFR_READ_POLICY_GFID_HASH, @@ -143,6 +155,19 @@ struct afr_nfsd { gf_boolean_t iamnfsd; 
}; +typedef struct _afr_lk_heal_info { + fd_t *fd; + int32_t cmd; + struct gf_flock flock; + dict_t *xdata_req; + unsigned char *locked_nodes; + struct list_head pos; + gf_lkowner_t lk_owner; + pid_t pid; + int32_t *child_up_event_gen; + int32_t *child_down_event_gen; +} afr_lk_heal_info_t; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -249,6 +274,10 @@ typedef struct _afr_private { gf_boolean_t esh_granular; gf_boolean_t consistent_io; gf_boolean_t data_self_heal; /* on/off */ + + /*For lock healing.*/ + struct list_head saved_locks; + struct list_head lk_healq; } afr_private_t; typedef enum { @@ -371,6 +400,10 @@ typedef struct { arrives, we continue to read off this subvol. */ int readdir_subvol; + /* lock-healing related members. */ + gf_boolean_t is_fd_bad; + afr_lk_heal_info_t *lk_heal_info; + } afr_fd_ctx_t; typedef enum { @@ -572,6 +605,11 @@ typedef struct _afr_local { struct gf_flock ret_flock; unsigned char *locked_nodes; int32_t cmd; + /*For lock healing only.*/ + unsigned char *dom_locked_nodes; + int32_t *dom_lock_op_ret; + int32_t *dom_lock_op_errno; + struct gf_flock *getlk_rsp; } lk; /* inode read */ @@ -1074,6 +1112,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); if (__local && __local->is_read_txn) \ afr_pending_read_decrement(__this->private, \ __local->read_subvol); \ + if (__local && afr_is_lock_mode_mandatory(__local->xdata_req)) \ + afr_dom_lock_release(frame); \ frame->local = NULL; \ } \ \ @@ -1354,4 +1394,10 @@ afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, void afr_selfheal_childup(xlator_t *this, afr_private_t *priv); + +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata); + +void +afr_dom_lock_release(call_frame_t *frame); #endif /* __AFR_H__ */ -- cgit
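
For reviewers trying the feature by hand, below is a minimal gfapi sketch of the client-side flow whose locks this patch heals: enable mandatory-lock enforcement on the fd via the trusted.glusterfs.enforce-mandatory-lock xattr, then take a mandatory write lock with glfs_file_lock(). It only uses calls already exercised by the tests in this patch; the host, volume, and file names are placeholders, and error handling is trimmed.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <glusterfs/api/glfs.h>

#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"

int
main(void)
{
    struct flock lock = {0};
    glfs_t *fs = NULL;
    glfs_fd_t *fd = NULL;
    int ret = -1;

    /* Placeholder host/volume/file names. */
    fs = glfs_new("testvol");
    if (!fs)
        return -1;
    glfs_set_volfile_server(fs, "tcp", "server1", 24007);
    if (glfs_init(fs) < 0)
        goto out;

    fd = glfs_creat(fs, "/FILE", O_CREAT, 0644);
    if (!fd)
        goto out;

    /* Ask the locks xlator to enforce mandatory locking on this fd. */
    if (glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0) < 0) {
        fprintf(stderr, "fsetxattr: %s\n", strerror(errno));
        goto out;
    }

    /* Byte-range write lock. AFR records it in priv->saved_locks and
     * re-acquires it on a down brick when that child comes back up. */
    lock.l_type = F_WRLCK;
    lock.l_whence = SEEK_SET;
    lock.l_start = 0;
    lock.l_len = 100;
    if (glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY) != 0) {
        fprintf(stderr, "file_lock: %s\n", strerror(errno));
        goto out;
    }

    /* ... I/O under the lock. If the lock is lost on quorum no. of bricks,
     * the fd is marked bad and further fd-based fops fail. ... */
    ret = 0;
out:
    if (fd)
        glfs_close(fd);
    /* glfs_fini() skipped, mirroring the tests above. */
    return ret;
}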