summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2019-10-16 13:06:29 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2019-10-30 11:34:37 +0000
commit593150979399f7f11e580591eab4b032bb0228ac (patch)
tree02cc3d061277fe71760e3b0ba02c6229bfda249f
parent9813ff934f43d09ecbd60b9f73c624b04b7dcc77 (diff)
afr: lock healing changes
Implements lock healing for gluster-block fencing use case. If mandatory lock is enabled: - Add domain lock/unlock to afr_lk fop. - Maintain a list of locks to be healed in afr_private_t. - Add lock to the list if afr_lk(F_SETLK or F_SETLKW) was sucessful. - Remove it from the list during afr_lk(F_UNLCK). - On child_down, mark lock as needing heal on that child. If lock is lost on quorum no. of bricks, remove it from the list and mark fd bad. - For fds marked as bad, fail the subsequent fd based fops. - On parent up, traverse the list and heal the locks IFF the client is the lk owner and has quorum. (shd does not heal any locks). updates: #613 Change-Id: I03c46ceaea30f5e6236d5ec13f71d843d827f1bc Signed-off-by: Ravishankar N <ravishankar@redhat.com>
-rw-r--r--tests/basic/fencing/afr-lock-heal-advanced.c227
-rw-r--r--tests/basic/fencing/afr-lock-heal-advanced.t104
-rw-r--r--tests/basic/fencing/afr-lock-heal-basic.c182
-rw-r--r--tests/basic/fencing/afr-lock-heal-basic.t99
-rw-r--r--xlators/cluster/afr/src/afr-common.c818
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c4
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c10
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h2
-rw-r--r--xlators/cluster/afr/src/afr-messages.h2
-rw-r--r--xlators/cluster/afr/src/afr.c3
-rw-r--r--xlators/cluster/afr/src/afr.h46
11 files changed, 1461 insertions, 36 deletions
diff --git a/tests/basic/fencing/afr-lock-heal-advanced.c b/tests/basic/fencing/afr-lock-heal-advanced.c
new file mode 100644
index 0000000..e202ccd
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-advanced.c
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+#include <glusterfs/api/glfs.h>
+#include <glusterfs/api/glfs-handles.h>
+
+#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
+
+FILE *logfile_fp;
+
+#define LOG_ERR(func, err) \
+ do { \
+ if (!logfile_fp) { \
+ fprintf(stderr, "%\n%d %s : returned error (%s)\n", __LINE__, \
+ func, strerror(err)); \
+ fflush(stderr); \
+ } else { \
+ fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__, \
+ func, strerror(err)); \
+ fflush(logfile_fp); \
+ } \
+ } while (0)
+
+glfs_t *
+setup_client(char *hostname, char *volname, char *log_file)
+{
+ int ret = 0;
+ glfs_t *fs = NULL;
+
+ fs = glfs_new(volname);
+ if (!fs) {
+ fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n",
+ strerror(errno));
+ goto error;
+ }
+
+ ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
+ if (ret < 0) {
+ fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n",
+ ret, strerror(errno));
+ goto error;
+ }
+
+ ret = glfs_set_logging(fs, log_file, 7);
+ if (ret < 0) {
+ fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n",
+ ret, strerror(errno));
+ goto error;
+ }
+
+ ret = glfs_init(fs);
+ if (ret < 0) {
+ fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret,
+ strerror(errno));
+ goto error;
+ }
+
+out:
+ return fs;
+error:
+ return NULL;
+}
+
+glfs_fd_t *
+open_file(glfs_t *fs, char *fname)
+{
+ glfs_fd_t *fd = NULL;
+
+ fd = glfs_creat(fs, fname, O_CREAT, 0644);
+ if (!fd) {
+ LOG_ERR("glfs_creat", errno);
+ goto out;
+ }
+out:
+ return fd;
+}
+
+int
+acquire_mandatory_lock(glfs_t *fs, glfs_fd_t *fd)
+{
+ struct flock lock;
+ int ret = 0;
+
+ /* initialize lock */
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 100;
+
+ ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0);
+ if (ret < 0) {
+ LOG_ERR("glfs_fsetxattr", errno);
+ ret = -1;
+ goto out;
+ }
+
+ /* take a write mandatory lock */
+ ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY);
+ if (ret) {
+ LOG_ERR("glfs_file_lock", errno);
+ ret = -1;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int
+perform_test(glfs_t *fs, char *file1, char *file2)
+{
+ int ret = 0;
+ glfs_fd_t *fd1 = NULL;
+ glfs_fd_t *fd2 = NULL;
+ char *buf = "0123456789";
+
+ fd1 = open_file(fs, file1);
+ if (!fd1) {
+ ret = -1;
+ goto out;
+ }
+ fd2 = open_file(fs, file2);
+ if (!fd2) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Kill one brick from the .t.*/
+ pause();
+
+ ret = acquire_mandatory_lock(fs, fd1);
+ if (ret) {
+ goto out;
+ }
+ ret = acquire_mandatory_lock(fs, fd2);
+ if (ret) {
+ goto out;
+ }
+
+ /* Bring the brick up and let the locks heal. */
+ pause();
+ /*At this point, the .t would have killed and brought back 2 bricks, marking
+ * the fd bad.*/
+
+ ret = glfs_write(fd1, buf, 10, 0);
+ if (ret > 0) {
+ /* Write is supposed to fail with EBADFD*/
+ LOG_ERR("glfs_write", ret);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (fd1)
+ glfs_close(fd1);
+ if (fd2)
+ glfs_close(fd2);
+ return ret;
+}
+
+static void
+sigusr1_handler(int signo)
+{
+ /*Signal caught. Just continue with the execution.*/
+}
+
+int
+main(int argc, char *argv[])
+{
+ int ret = 0;
+ glfs_t *fs = NULL;
+ char *volname = NULL;
+ char log_file[100];
+ char *hostname = NULL;
+ char *fname1 = NULL;
+ char *fname2 = NULL;
+
+ if (argc != 7) {
+ fprintf(stderr,
+ "Expect following args %s <host> <volname> <file1> <file2> "
+ "<log file "
+ "location> <log_file_suffix>\n",
+ argv[0]);
+ return -1;
+ }
+
+ hostname = argv[1];
+ volname = argv[2];
+ fname1 = argv[3];
+ fname2 = argv[4];
+
+ /*Use SIGUSR1 and pause()as a means of hitting break-points this program
+ *when signalled from the .t test case.*/
+ if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) {
+ LOG_ERR("SIGUSR1 handler error", errno);
+ exit(EXIT_FAILURE);
+ }
+
+ sprintf(log_file, "%s/%s.%s.%s", argv[5], "lock-heal.c", argv[6], "log");
+ logfile_fp = fopen(log_file, "w");
+ if (!logfile_fp) {
+ fprintf(stderr, "\nfailed to open %s\n", log_file);
+ fflush(stderr);
+ return -1;
+ }
+
+ sprintf(log_file, "%s/%s.%s.%s", argv[5], "glfs-client", argv[6], "log");
+ fs = setup_client(hostname, volname, log_file);
+ if (!fs) {
+ LOG_ERR("setup_client", errno);
+ return -1;
+ }
+
+ ret = perform_test(fs, fname1, fname2);
+
+error:
+ if (fs) {
+ /*glfs_fini(fs)*/; // glfs fini path is racy and crashes the program
+ }
+
+ fclose(logfile_fp);
+
+ return ret;
+}
diff --git a/tests/basic/fencing/afr-lock-heal-advanced.t b/tests/basic/fencing/afr-lock-heal-advanced.t
new file mode 100644
index 0000000..8a7a208
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-advanced.t
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+function is_gfapi_program_alive()
+{
+ pid=$1
+ ps -p $pid
+ if [ $? -eq 0 ]
+ then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+function get_active_lock_count {
+ brick=$1
+ sdump=$(generate_brick_statedump $V0 $H0 $brick)
+ lock_count="$(grep ACTIVE $sdump| wc -l)"
+ echo "$lock_count"
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+EXPECT 'Created' volinfo_field $V0 'Status';
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.open-behind off
+TEST $CLI volume set $V0 locks.mandatory-locking forced
+TEST $CLI volume set $V0 enforce-mandatory-lock on
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+logdir=`gluster --print-logdir`
+TEST build_tester $(dirname $0)/afr-lock-heal-advanced.c -lgfapi -ggdb
+
+#------------------------------------------------------------------------------
+# Use more than 1 fd from same client so that list_for_each_* loops are executed more than once.
+$(dirname $0)/afr-lock-heal-advanced $H0 $V0 "/FILE1" "/FILE2" $logdir C1&
+client_pid=$!
+TEST [ $client_pid ]
+
+TEST sleep 5 # By now, the client would have opened an fd on FILE1 and FILE2 and waiting for a SIGUSR1.
+EXPECT "Y" is_gfapi_program_alive $client_pid
+
+# Kill brick-3 and let client-1 take lock on both files.
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST kill -SIGUSR1 $client_pid
+# If program is still alive, glfs_file_lock() was a success.
+EXPECT "Y" is_gfapi_program_alive $client_pid
+
+# Check lock is present on brick-1 and brick-2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}1
+
+# Restart brick-3 and check that the lock has healed on it.
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd.
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}2
+
+#------------------------------------------------------------------------------
+# Kill same brick before heal completes the first time and check it completes the second time.
+TEST $CLI volume set $V0 delay-gen locks
+TEST $CLI volume set $V0 delay-gen.delay-duration 5000000
+TEST $CLI volume set $V0 delay-gen.delay-percentage 100
+TEST $CLI volume set $V0 delay-gen.enable finodelk
+
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST $CLI volume reset $V0 delay-gen
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0
+
+#------------------------------------------------------------------------------
+# Kill 2 bricks and bring it back. The fds must be marked bad.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+
+# TODO: `gluster v statedump $V0 client localhost:$client_pid` is not working,
+# so sleep for 20 seconds for the client to connect to connect to the bricks.
+TEST sleep $CHILD_UP_TIMEOUT
+
+# Try to write to FILE1 from the .c; it must fail.
+TEST kill -SIGUSR1 $client_pid
+wait $client_pid
+ret=$?
+TEST [ $ret == 0 ]
+
+cleanup_tester $(dirname $0)/afr-lock-heal-advanced
+cleanup;
diff --git a/tests/basic/fencing/afr-lock-heal-basic.c b/tests/basic/fencing/afr-lock-heal-basic.c
new file mode 100644
index 0000000..768c9e5
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-basic.c
@@ -0,0 +1,182 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+#include <glusterfs/api/glfs.h>
+#include <glusterfs/api/glfs-handles.h>
+
+#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
+
+FILE *logfile_fp;
+
+#define LOG_ERR(func, err) \
+ do { \
+ if (!logfile_fp) { \
+ fprintf(stderr, "%\n%d %s : returned error (%s)\n", __LINE__, \
+ func, strerror(err)); \
+ fflush(stderr); \
+ } else { \
+ fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__, \
+ func, strerror(err)); \
+ fflush(logfile_fp); \
+ } \
+ } while (0)
+
+glfs_t *
+setup_client(char *hostname, char *volname, char *log_file)
+{
+ int ret = 0;
+ glfs_t *fs = NULL;
+
+ fs = glfs_new(volname);
+ if (!fs) {
+ fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n",
+ strerror(errno));
+ goto error;
+ }
+
+ ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
+ if (ret < 0) {
+ fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n",
+ ret, strerror(errno));
+ goto error;
+ }
+
+ ret = glfs_set_logging(fs, log_file, 7);
+ if (ret < 0) {
+ fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n",
+ ret, strerror(errno));
+ goto error;
+ }
+
+ ret = glfs_init(fs);
+ if (ret < 0) {
+ fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret,
+ strerror(errno));
+ goto error;
+ }
+
+out:
+ return fs;
+error:
+ return NULL;
+}
+
+int
+acquire_mandatory_lock(glfs_t *fs, char *fname)
+{
+ struct flock lock;
+ int ret = 0;
+ glfs_fd_t *fd = NULL;
+
+ fd = glfs_creat(fs, fname, O_CREAT, 0644);
+ if (!fd) {
+ if (errno != EEXIST) {
+ LOG_ERR("glfs_creat", errno);
+ ret = -1;
+ goto out;
+ }
+ fd = glfs_open(fs, fname, O_RDWR | O_NONBLOCK);
+ if (!fd) {
+ LOG_ERR("glfs_open", errno);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* initialize lock */
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 100;
+
+ ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0);
+ if (ret < 0) {
+ LOG_ERR("glfs_fsetxattr", errno);
+ ret = -1;
+ goto out;
+ }
+
+ pause();
+
+ /* take a write mandatory lock */
+ ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY);
+ if (ret) {
+ LOG_ERR("glfs_file_lock", errno);
+ goto out;
+ }
+
+ pause();
+
+out:
+ if (fd) {
+ glfs_close(fd);
+ }
+
+ return ret;
+}
+
+static void
+sigusr1_handler(int signo)
+{
+ /*Signal caught. Just continue with the execution.*/
+}
+
+int
+main(int argc, char *argv[])
+{
+ int ret = 0;
+ glfs_t *fs = NULL;
+ char *volname = NULL;
+ char log_file[100];
+ char *hostname = NULL;
+ char *fname = NULL;
+
+ if (argc != 6) {
+ fprintf(stderr,
+ "Expect following args %s <host> <volname> <file> <log file "
+ "location> <log_file_suffix>\n",
+ argv[0]);
+ return -1;
+ }
+
+ hostname = argv[1];
+ volname = argv[2];
+ fname = argv[3];
+
+ /*Use SIGUSR1 and pause()as a means of hitting break-points this program
+ *when signalled from the .t test case.*/
+ if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) {
+ LOG_ERR("SIGUSR1 handler error", errno);
+ exit(EXIT_FAILURE);
+ }
+
+ sprintf(log_file, "%s/%s.%s.%s", argv[4], "lock-heal-basic.c", argv[5],
+ "log");
+ logfile_fp = fopen(log_file, "w");
+ if (!logfile_fp) {
+ fprintf(stderr, "\nfailed to open %s\n", log_file);
+ fflush(stderr);
+ return -1;
+ }
+
+ sprintf(log_file, "%s/%s.%s.%s", argv[4], "glfs-client", argv[5], "log");
+ fs = setup_client(hostname, volname, log_file);
+ if (!fs) {
+ LOG_ERR("setup_client", errno);
+ return -1;
+ }
+
+ ret = acquire_mandatory_lock(fs, fname);
+
+error:
+ if (fs) {
+ /*glfs_fini(fs)*/; // glfs fini path is racy and crashes the program
+ }
+
+ fclose(logfile_fp);
+
+ return ret;
+}
diff --git a/tests/basic/fencing/afr-lock-heal-basic.t b/tests/basic/fencing/afr-lock-heal-basic.t
new file mode 100644
index 0000000..5ac05c7
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-basic.t
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+function is_gfapi_program_alive()
+{
+ pid=$1
+ ps -p $pid
+ if [ $? -eq 0 ]
+ then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+EXPECT 'Created' volinfo_field $V0 'Status';
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.open-behind off
+TEST $CLI volume set $V0 locks.mandatory-locking forced
+TEST $CLI volume set $V0 enforce-mandatory-lock on
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+logdir=`gluster --print-logdir`
+TEST build_tester $(dirname $0)/afr-lock-heal-basic.c -lgfapi -ggdb
+
+$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C1&
+client1_pid=$!
+TEST [ $client1_pid ]
+
+$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C2&
+client2_pid=$!
+TEST [ $client2_pid ]
+
+TEST sleep 5 # By now, the 2 clients would have opened an fd on FILE and waiting for a SIGUSR1.
+EXPECT "Y" is_gfapi_program_alive $client1_pid
+EXPECT "Y" is_gfapi_program_alive $client2_pid
+
+# Kill brick-3 and let client-1 take lock on the file.
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST kill -SIGUSR1 $client1_pid
+# If program is still alive, glfs_file_lock() was a success.
+EXPECT "Y" is_gfapi_program_alive $client1_pid
+
+# Check lock is present on brick-1 and brick-2
+b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0)
+b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1)
+c1_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+c1_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b2" ]
+
+# Restart brick-3 and check that the lock has healed on it.
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal.
+
+b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2)
+c1_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b3" ]
+
+# Kill brick-1 and let client-2 preempt the lock on bricks 2 and 3.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill -SIGUSR1 $client2_pid
+# If program is still alive, glfs_file_lock() was a success.
+EXPECT "Y" is_gfapi_program_alive $client2_pid
+
+# Restart brick-1 and let lock healing complete.
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal.
+
+# Check that all bricks now have locks from client 2 only.
+b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0)
+b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1)
+b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2)
+c2_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+c2_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+c2_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b2" ]
+TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b3" ]
+TEST [ "$c2_lock_on_b1" != "$c1_lock_on_b1" ]
+
+#Let the client programs run and exit.
+TEST kill -SIGUSR1 $client1_pid
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client1_pid
+TEST kill -SIGUSR1 $client2_pid
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client2_pid
+
+cleanup_tester $(dirname $0)/afr-lock-heal-basic
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 4b22af7..07bf53a 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,6 +45,21 @@ afr_quorum_errno(afr_private_t *priv)
return ENOTCONN;
}
+static void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ replies[i] = 1;
+ } else {
+ replies[i] = 0;
+ }
+ }
+}
+
int
afr_fav_child_reset_sink_xattrs(void *opaque);
@@ -54,6 +69,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);
static void
afr_discover_done(call_frame_t *frame, xlator_t *this);
+int
+afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ local->cont.lk.dom_lock_op_ret[i] = op_ret;
+ local->cont.lk.dom_lock_op_errno[i] = op_errno;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to acquire %s on %s",
+ uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+ priv->children[i]->name);
+ } else {
+ local->cont.lk.dom_locked_nodes[i] = 1;
+ }
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+int
+afr_dom_lock_acquire(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ local->cont.lk.dom_locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+ if (!local->cont.lk.dom_locked_nodes) {
+ return -ENOMEM;
+ }
+ local->cont.lk.dom_lock_op_ret = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_ret) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ local->cont.lk.dom_lock_op_errno = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_errno) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ flock.l_type = F_WRLCK;
+
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLK, &flock, NULL);
+
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL))
+ goto blocking_lock;
+
+ /*If any of the bricks returned EAGAIN, we still need blocking locks.*/
+ if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) !=
+ priv->child_count) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.dom_lock_op_ret[i] == -1 &&
+ local->cont.lk.dom_lock_op_errno[i] == EAGAIN)
+ goto blocking_lock;
+ }
+ }
+
+ return 0;
+
+blocking_lock:
+ afr_dom_lock_release(frame);
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLKW, &flock, NULL);
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) {
+ afr_dom_lock_release(frame);
+ return -afr_quorum_errno(priv);
+ }
+
+ return 0;
+}
+
+int
+afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to release %s on %s", local->loc.path,
+ AFR_LK_HEAL_DOM, priv->children[i]->name);
+ }
+ local->cont.lk.dom_locked_nodes[i] = 0;
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+void
+afr_dom_lock_release(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = frame->this->private;
+ locked_on = local->cont.lk.dom_locked_nodes;
+ if (AFR_COUNT(locked_on, priv->child_count) == 0)
+ return;
+ flock.l_type = F_UNLCK;
+
+ AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk,
+ AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL);
+
+ return;
+}
+
+static void
+afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info)
+{
+ if (!info)
+ return;
+ if (info->xdata_req)
+ dict_unref(info->xdata_req);
+ if (info->fd)
+ fd_unref(info->fd);
+ GF_FREE(info->locked_nodes);
+ GF_FREE(info->child_up_event_gen);
+ GF_FREE(info->child_down_event_gen);
+ GF_FREE(info);
+}
+
+static int
+afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -ENOMEM;
+
+ info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t);
+ if (!info) {
+ goto cleanup;
+ }
+ INIT_LIST_HEAD(&info->pos);
+ info->fd = fd_ref(local->fd);
+ info->cmd = local->cont.lk.cmd;
+ info->pid = frame->root->pid;
+ info->flock = local->cont.lk.user_flock;
+ info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL);
+ if (!info->xdata_req) {
+ goto cleanup;
+ }
+ info->lk_owner = frame->root->lk_owner;
+ info->locked_nodes = GF_MALLOC(
+ sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char);
+ if (!info->locked_nodes) {
+ goto cleanup;
+ }
+ memcpy(info->locked_nodes, local->cont.lk.locked_nodes,
+ sizeof(*info->locked_nodes) * priv->child_count);
+ info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen),
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!info->child_up_event_gen) {
+ goto cleanup;
+ }
+ info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!info->child_down_event_gen) {
+ goto cleanup;
+ }
+
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(local->fd, this);
+ if (fd_ctx)
+ fd_ctx->lk_heal_info = info;
+ }
+ UNLOCK(&local->fd->lock);
+ if (!fd_ctx) {
+ goto cleanup;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&info->pos, &priv->saved_locks);
+ }
+ UNLOCK(&priv->lock);
+
+ return 0;
+cleanup:
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to add lock to healq",
+ uuid_utoa(local->fd->inode->gfid));
+ if (info) {
+ afr_lk_heal_info_cleanup(info);
+ if (fd_ctx) {
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx->lk_heal_info = NULL;
+ }
+ UNLOCK(&local->fd->lock);
+ }
+ }
+ return ret;
+}
+
+static int
+afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = local->cont.lk.user_flock;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -EINVAL;
+
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx || !fd_ctx->lk_heal_info) {
+ goto out;
+ }
+
+ info = fd_ctx->lk_heal_info;
+ if ((info->flock.l_start != flock.l_start) ||
+ (info->flock.l_whence != flock.l_whence) ||
+ (info->flock.l_len != flock.l_len)) {
+ /*TODO: Compare lkowners too.*/
+ goto out;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ UNLOCK(&priv->lock);
+
+ afr_lk_heal_info_cleanup(info);
+ fd_ctx->lk_heal_info = NULL;
+ ret = 0;
+out:
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to remove lock from healq",
+ uuid_utoa(local->fd->inode->gfid));
+ return ret;
+}
+
+int
+afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed to heal lock on child %d for %s", i,
+ uuid_utoa(local->fd->inode->gfid));
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid));
+ } else {
+ local->cont.lk.getlk_rsp[i] = *lock;
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+static gf_boolean_t
+afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ afr_local_t *local = frame->local;
+ struct gf_flock flock = {
+ 0,
+ };
+ gf_boolean_t ret = _gf_true;
+ char *wind_on = alloca0(priv->child_count);
+ unsigned char *success_replies = alloca0(priv->child_count);
+ local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp),
+ priv->child_count, gf_afr_mt_gf_lock);
+
+ flock = info->flock;
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ wind_on[i] = 1;
+ }
+
+ AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock,
+ info->xdata_req);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) == 0) {
+ ret = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret != 0)
+ continue;
+ if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK)
+ continue;
+ /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/
+ if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner,
+ &info->lk_owner)) {
+ ret = _gf_false;
+ break;
+ }
+ }
+out:
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->cont.lk.getlk_rsp);
+ local->cont.lk.getlk_rsp = NULL;
+ return ret;
+}
+
+static void
+afr_mark_fd_bad(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ if (!fd)
+ return;
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ if (fd_ctx) {
+ fd_ctx->is_fd_bad = _gf_true;
+ fd_ctx->lk_heal_info = NULL;
+ }
+ }
+ UNLOCK(&fd->lock);
+}
+
+static void
+afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info)
+{
+ LOCK(&priv->lock);
+ {
+ list_del(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+}
+
+static void
+afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ int op_errno = 0;
+ int32_t *current_event_gen = NULL;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ char *wind_on = alloca0(priv->child_count);
+ gf_boolean_t retry = _gf_true;
+
+ frame->root->pid = info->pid;
+ lk_owner_copy(&frame->root->lk_owner, &info->lk_owner);
+
+ op_errno = -afr_dom_lock_acquire(frame);
+ if ((op_errno != 0)) {
+ goto release;
+ }
+
+ if (!afr_does_lk_owner_match(frame, priv, info)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM,
+ "Ignoring lock heal for %s since lk-onwers mismatch. "
+ "Lock possibly pre-empted by another client.",
+ uuid_utoa(info->fd->inode->gfid));
+ goto release;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ continue;
+ wind_on[i] = 1;
+ }
+
+ current_event_gen = alloca(priv->child_count);
+ memcpy(current_event_gen, info->child_up_event_gen,
+ priv->child_count * sizeof *current_event_gen);
+ AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd,
+ &info->flock, info->xdata_req);
+
+ LOCK(&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) {
+ continue;
+ }
+
+ if ((current_event_gen[i] == info->child_up_event_gen[i]) &&
+ (current_event_gen[i] > info->child_down_event_gen[i])) {
+ info->locked_nodes[i] = 1;
+ retry = _gf_false;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->saved_locks);
+ } else {
+ /*We received subsequent child up/down events while heal was in
+ * progress; don't mark child as healed. Attempt again on the
+ * new child up*/
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM,
+ "Event gen mismatch: skipped healing lock on child %d "
+ "for %s.",
+ i, uuid_utoa(info->fd->inode->gfid));
+ }
+ }
+ }
+ UNLOCK(&priv->lock);
+
+release:
+ afr_dom_lock_release(frame);
+ if (retry)
+ afr_add_lock_to_lkhealq(priv, info);
+ return;
+}
+
+static int
+afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ STACK_DESTROY(frame->root);
+ return 0;
+}
+
+static int
+afr_lock_heal(void *opaque)
+{
+ call_frame_t *frame = (call_frame_t *)opaque;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+ struct list_head healq = {
+ 0,
+ };
+ int ret = 0;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&healq);
+ LOCK(&priv->lock);
+ {
+ list_splice_init(&priv->lk_healq, &healq);
+ }
+ UNLOCK(&priv->lock);
+
+ list_for_each_entry_safe(info, tmp, &healq, pos)
+ {
+ GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) <
+ priv->child_count));
+ ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd);
+ afr_lock_heal_do(iter_frame, priv, info);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = ENOTCONN;
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN,
+ AFR_MSG_LK_HEAL_DOM,
+ "Aborting processing of lk_healq."
+ "Healing will be reattempted on next child up for locks "
+ "that are still in quorum.");
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&healq, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+ break;
+ }
+ }
+
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
+}
+
+static int
+__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_up_event_gen[child] = priv->event_generation;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return -1;
+
+ ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame,
+ frame);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM,
+ "Failed to launch lock heal synctask");
+
+ return ret;
+}
+
+static int
+__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child)
+{
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_down_event_gen[child] = priv->event_generation;
+ if (info->locked_nodes[child] == 1)
+ info->locked_nodes[child] = 0;
+ if (!afr_has_quorum(info->locked_nodes, this, NULL)) {
+ /* Since the lock was lost on quorum no. of nodes, we should
+ * not attempt to heal it anymore. Some other client could have
+ * acquired the lock, modified data and released it and this
+ * client wouldn't know about it if we heal it.*/
+ afr_mark_fd_bad(info->fd, this);
+ list_del(&info->pos);
+ afr_lk_heal_info_cleanup(info);
+ /* We're not winding an unlock on the node where the lock is still
+ * present because when fencing logic switches over to the new
+ * client (since we marked the fd bad), it should preempt any
+ * existing lock. */
+ }
+ }
+ return 0;
+}
+
gf_boolean_t
afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
int32_t *op_errno)
@@ -68,6 +658,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
return _gf_true;
}
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata)
+{
+ int ret = 0;
+ uint32_t lk_mode = GF_LK_ADVISORY;
+
+ ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode);
+ if (!ret && lk_mode == GF_LK_MANDATORY)
+ return _gf_true;
+
+ return _gf_false;
+}
+
call_frame_t *
afr_copy_frame(call_frame_t *base)
{
@@ -1224,18 +1827,6 @@ refresh_done:
return 0;
}
-static void
-afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
- unsigned char *replies)
-{
- int i = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].valid && local->replies[i].op_ret == 0)
- replies[i] = 1;
- }
-}
-
int
afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
{
@@ -2049,6 +2640,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this)
{ /* lk */
GF_FREE(local->cont.lk.locked_nodes);
+ GF_FREE(local->cont.lk.dom_locked_nodes);
+ GF_FREE(local->cont.lk.dom_lock_op_ret);
+ GF_FREE(local->cont.lk.dom_lock_op_errno);
}
{ /* create */
@@ -3451,8 +4045,18 @@ out:
}
void
-_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx)
+_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)
{
+ afr_private_t *priv = this->private;
+
+ if (fd_ctx->lk_heal_info) {
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info);
+ fd_ctx->lk_heal_info = NULL;
+ }
GF_FREE(fd_ctx->opened_on);
GF_FREE(fd_ctx);
return;
@@ -3472,7 +4076,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long)ctx;
if (fd_ctx) {
- _afr_cleanup_fd_ctx(fd_ctx);
+ _afr_cleanup_fd_ctx(this, fd_ctx);
}
out:
@@ -3565,13 +4169,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd)
}
fd_ctx->readdir_subvol = -1;
+ fd_ctx->lk_heal_info = NULL;
ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);
if (ret)
gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);
out:
if (ret && fd_ctx)
- _afr_cleanup_fd_ctx(fd_ctx);
+ _afr_cleanup_fd_ctx(this, fd_ctx);
return ret;
}
@@ -3694,6 +4299,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
call_stub_t *stub = NULL;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -4230,9 +4836,9 @@ out:
}
static int32_t
-afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
- loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock,
- dict_t *xdata)
+afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
{
afr_local_t *local = NULL;
int32_t op_errno = ENOMEM;
@@ -4244,8 +4850,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
local->op = fop;
if (loc)
loc_copy(&local->loc, loc);
- if (fd)
+ if (fd && (flock->l_type != F_UNLCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local->fd = fd_ref(fd);
+ }
local->cont.inodelk.volume = gf_strdup(volume);
if (!local->cont.inodelk.volume) {
@@ -4274,8 +4882,8 @@ int32_t
afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock,
- xdata);
+ afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd,
+ flock, xdata);
return 0;
}
@@ -4283,15 +4891,16 @@ int32_t
afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock,
- xdata);
+ afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd,
+ flock, xdata);
return 0;
}
static int
-afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
- loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd,
- entrylk_type type, dict_t *xdata)
+afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
afr_local_t *local = NULL;
int32_t op_errno = ENOMEM;
@@ -4303,8 +4912,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
local->op = fop;
if (loc)
loc_copy(&local->loc, loc);
- if (fd)
+ if (fd && (cmd != ENTRYLK_UNLOCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local->fd = fd_ref(fd);
+ }
local->cont.entrylk.cmd = cmd;
local->cont.entrylk.in_cmd = cmd;
local->cont.entrylk.type = type;
@@ -4331,8 +4942,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
const char *basename, entrylk_cmd cmd, entrylk_type type,
dict_t *xdata)
{
- afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd,
- type, xdata);
+ afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename,
+ cmd, type, xdata);
return 0;
}
@@ -4341,8 +4952,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
const char *basename, entrylk_cmd cmd, entrylk_type type,
dict_t *xdata)
{
- afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd,
- type, xdata);
+ afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename,
+ cmd, type, xdata);
return 0;
}
@@ -4460,9 +5071,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
call_count = afr_frame_return(frame);
- if (call_count == 0)
+ if (call_count == 0) {
AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
local->xdata_rsp);
+ }
return 0;
}
@@ -4561,11 +5173,133 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
}
int
+afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ return 0;
+}
+
+int
+afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = -1;
+
+ local = frame->local;
+ child_index = (long)cookie;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int child_index = (long)cookie;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+ return 0;
+}
+int
+afr_lk_transaction(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *wind_on = NULL;
+ int op_errno = 0;
+ int i = 0;
+ int ret = 0;
+
+ frame = (call_frame_t *)opaque;
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+
+ if (priv->arbiter_count || priv->child_count != 3) {
+ op_errno = ENOTSUP;
+ gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Lock healing supported only for replica 3 volumes.",
+ uuid_utoa(local->fd->inode->gfid));
+ goto err;
+ }
+
+ op_errno = -afr_dom_lock_acquire(frame); // Released during
+ // AFR_STACK_UNWIND
+ if (op_errno != 0) {
+ goto err;
+ }
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) {
+ op_errno = afr_final_errno(local, priv);
+ goto err;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i])
+ wind_on[i] = 1;
+ }
+ AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ goto unlock;
+ } else {
+ if (local->cont.lk.user_flock.l_type == F_UNLCK)
+ ret = afr_remove_lock_from_saved_locks(local, this);
+ else
+ ret = afr_add_lock_to_saved_locks(frame, this);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ goto unlock;
+ }
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
+
+ return 0;
+
+unlock:
+ local->cont.lk.user_flock.l_type = F_UNLCK;
+ AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock, NULL);
+err:
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return -1;
+}
+
+int
afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ int ret = 0;
int i = 0;
int32_t op_errno = ENOMEM;
@@ -4576,9 +5310,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
goto out;
local->op = GF_FOP_LK;
- if (!afr_lk_is_unlock(cmd, flock) &&
- !afr_is_consistent_io_possible(local, priv, &op_errno))
- goto out;
+ if (!afr_lk_is_unlock(cmd, flock)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ }
local->cont.lk.locked_nodes = GF_CALLOC(
priv->child_count, sizeof(*local->cont.lk.locked_nodes),
@@ -4596,6 +5332,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
if (xdata)
local->xdata_req = dict_ref(xdata);
+ if (afr_is_lock_mode_mandatory(xdata)) {
+ ret = synctask_new(this->ctx->env, afr_lk_transaction,
+ afr_lk_transaction_cbk, frame, frame);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ return 0;
+ }
+
STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i],
priv->children[i]->fops->lk, fd, cmd, flock,
local->xdata_req);
@@ -5593,6 +6339,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
__afr_handle_child_up_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
+ __afr_lock_heal_synctask(this, priv, idx);
break;
case GF_EVENT_CHILD_DOWN:
@@ -5606,6 +6353,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
__afr_handle_child_down_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
+ __afr_mark_pending_lk_heal(this, priv, idx);
break;
case GF_EVENT_CHILD_CONNECTING:
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index e8894a6..c552170 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -302,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
afr_local_t *local = NULL;
int op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -1698,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
int32_t op_errno = 0;
fop_fgetxattr_cbk_t cbk = NULL;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -1791,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
afr_local_t *local = NULL;
int32_t op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -1866,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
afr_local_t *local = NULL;
int32_t op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 9acb4d0..a3d2150 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -491,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
int op_errno = ENOMEM;
int ret = -1;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -730,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -940,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1690,6 +1693,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1898,6 +1902,7 @@ afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1998,6 +2003,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2107,6 +2113,7 @@ afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2213,6 +2220,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2412,6 +2420,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2507,6 +2516,7 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
int ret = -1;
int32_t op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index b0fb006..816065f 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -31,6 +31,8 @@ enum gf_afr_mem_types_ {
gf_afr_mt_empty_brick_t,
gf_afr_mt_child_latency_t,
gf_afr_mt_atomic_t,
+ gf_afr_mt_lk_heal_info_t,
+ gf_afr_mt_gf_lock,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index c9c9927..8e59c51 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -42,6 +42,6 @@ GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET,
AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS,
AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL,
AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED,
- AFR_MSG_THIN_ARB);
+ AFR_MSG_THIN_ARB, AFR_MSG_LK_HEAL_DOM);
#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index f8db3c5..13b5ca2 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -417,6 +417,8 @@ init(xlator_t *this)
goto out;
priv = this->private;
+ INIT_LIST_HEAD(&priv->saved_locks);
+ INIT_LIST_HEAD(&priv->lk_healq);
LOCK_INIT(&priv->lock);
child_count = xlator_subvolume_count(this);
@@ -684,6 +686,7 @@ fini(xlator_t *this)
priv = this->private;
afr_selfheal_daemon_fini(this);
+ GF_ASSERT(list_empty(&priv->saved_locks));
LOCK(&priv->lock);
if (priv->timer != NULL) {
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index f86f019..28be839 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -39,6 +39,8 @@
#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
+#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
+
#define AFR_HALO_MAX_LATENCY 99999
#define PFLAG_PENDING (1 << 0)
@@ -95,6 +97,16 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \
} while (0)
+#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \
+ do { \
+ afr_fd_ctx_t *__fd_ctx = NULL; \
+ __fd_ctx = afr_fd_ctx_get(__fd, __this); \
+ if (__fd_ctx && __fd_ctx->is_fd_bad) { \
+ __error = EBADF; \
+ goto __label; \
+ } \
+ } while (0)
+
typedef enum {
AFR_READ_POLICY_FIRST_UP,
AFR_READ_POLICY_GFID_HASH,
@@ -143,6 +155,19 @@ struct afr_nfsd {
gf_boolean_t iamnfsd;
};
+typedef struct _afr_lk_heal_info {
+ fd_t *fd;
+ int32_t cmd;
+ struct gf_flock flock;
+ dict_t *xdata_req;
+ unsigned char *locked_nodes;
+ struct list_head pos;
+ gf_lkowner_t lk_owner;
+ pid_t pid;
+ int32_t *child_up_event_gen;
+ int32_t *child_down_event_gen;
+} afr_lk_heal_info_t;
+
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
@@ -249,6 +274,10 @@ typedef struct _afr_private {
gf_boolean_t esh_granular;
gf_boolean_t consistent_io;
gf_boolean_t data_self_heal; /* on/off */
+
+ /*For lock healing.*/
+ struct list_head saved_locks;
+ struct list_head lk_healq;
} afr_private_t;
typedef enum {
@@ -371,6 +400,10 @@ typedef struct {
arrives, we continue to read off this subvol.
*/
int readdir_subvol;
+ /* lock-healing related members. */
+ gf_boolean_t is_fd_bad;
+ afr_lk_heal_info_t *lk_heal_info;
+
} afr_fd_ctx_t;
typedef enum {
@@ -572,6 +605,11 @@ typedef struct _afr_local {
struct gf_flock ret_flock;
unsigned char *locked_nodes;
int32_t cmd;
+ /*For lock healing only.*/
+ unsigned char *dom_locked_nodes;
+ int32_t *dom_lock_op_ret;
+ int32_t *dom_lock_op_errno;
+ struct gf_flock *getlk_rsp;
} lk;
/* inode read */
@@ -1074,6 +1112,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
if (__local && __local->is_read_txn) \
afr_pending_read_decrement(__this->private, \
__local->read_subvol); \
+ if (__local && afr_is_lock_mode_mandatory(__local->xdata_req)) \
+ afr_dom_lock_release(frame); \
frame->local = NULL; \
} \
\
@@ -1354,4 +1394,10 @@ afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,
void
afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata);
+
+void
+afr_dom_lock_release(call_frame_t *frame);
#endif /* __AFR_H__ */