From 593150979399f7f11e580591eab4b032bb0228ac Mon Sep 17 00:00:00 2001
From: Ravishankar N
Date: Wed, 16 Oct 2019 13:06:29 +0530
Subject: afr: lock healing changes

Implements lock healing for the gluster-block fencing use case. If
mandatory lock is enabled:
- Add domain lock/unlock to afr_lk fop.
- Maintain a list of locks to be healed in afr_private_t.
- Add lock to the list if afr_lk(F_SETLK or F_SETLKW) was successful.
- Remove it from the list during afr_lk(F_UNLCK).
- On child_down, mark lock as needing heal on that child. If lock is
  lost on quorum no. of bricks, remove it from the list and mark fd bad.
- For fds marked as bad, fail the subsequent fd based fops.
- On parent up, traverse the list and heal the locks IFF the client is
  the lk owner and has quorum. (shd does not heal any locks).

updates: #613
Change-Id: I03c46ceaea30f5e6236d5ec13f71d843d827f1bc
Signed-off-by: Ravishankar N
---
 tests/basic/fencing/afr-lock-heal-advanced.c | 227 ++++++++
 tests/basic/fencing/afr-lock-heal-advanced.t | 104 ++++
 tests/basic/fencing/afr-lock-heal-basic.c    | 182 ++++++
 tests/basic/fencing/afr-lock-heal-basic.t    |  99 ++++
 xlators/cluster/afr/src/afr-common.c         | 818 +++++++++++++++++++++++++--
 xlators/cluster/afr/src/afr-inode-read.c     |   4 +
 xlators/cluster/afr/src/afr-inode-write.c    |  10 +
 xlators/cluster/afr/src/afr-mem-types.h      |   2 +
 xlators/cluster/afr/src/afr-messages.h       |   2 +-
 xlators/cluster/afr/src/afr.c                |   3 +
 xlators/cluster/afr/src/afr.h                |  46 ++
 11 files changed, 1461 insertions(+), 36 deletions(-)
 create mode 100644 tests/basic/fencing/afr-lock-heal-advanced.c
 create mode 100644 tests/basic/fencing/afr-lock-heal-advanced.t
 create mode 100644 tests/basic/fencing/afr-lock-heal-basic.c
 create mode 100644 tests/basic/fencing/afr-lock-heal-basic.t

diff --git a/tests/basic/fencing/afr-lock-heal-advanced.c b/tests/basic/fencing/afr-lock-heal-advanced.c
new file mode 100644
index 00000000000..e202ccd5b29
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-advanced.c
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <glusterfs/api/glfs.h>
+
+#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
+
+FILE *logfile_fp;
+
+#define LOG_ERR(func, err)                                                   \
+    do {                                                                     \
+        if (!logfile_fp) {                                                   \
+            fprintf(stderr, "\n%d %s : returned error (%s)\n", __LINE__,     \
+                    func, strerror(err));                                    \
+            fflush(stderr);                                                  \
+        } else {                                                             \
+            fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__, \
+                    func, strerror(err));                                    \
+            fflush(logfile_fp);                                              \
+        }                                                                    \
+    } while (0)
+
+glfs_t *
+setup_client(char *hostname, char *volname, char *log_file)
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+
+    fs = glfs_new(volname);
+    if (!fs) {
+        fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n",
+                strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_logging(fs, log_file, 7);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_init(fs);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret,
+                strerror(errno));
+        goto error;
+    }
+
+out:
+    return fs;
+error:
+    return NULL;
+}
+
+glfs_fd_t *
+open_file(glfs_t *fs, char *fname)
+{
+    glfs_fd_t *fd = NULL;
+
+    fd = glfs_creat(fs, fname, O_CREAT, 0644);
+    if (!fd) {
+        LOG_ERR("glfs_creat", errno);
+        goto out;
+    }
+out:
+    return fd;
+}
+
+int
+acquire_mandatory_lock(glfs_t *fs, glfs_fd_t *fd)
+{
+    struct flock lock;
+    int ret = 0;
+
+    /* initialize lock */
+    lock.l_type = F_WRLCK;
+    lock.l_whence = SEEK_SET;
+    lock.l_start = 0;
+    lock.l_len = 100;
+
+    ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0);
+    if (ret < 0) {
+        LOG_ERR("glfs_fsetxattr", errno);
+        ret = -1;
+        goto out;
+    }
+
+    /* take a write mandatory lock */
+    ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY);
+    if (ret) {
+        LOG_ERR("glfs_file_lock", errno);
+        ret = -1;
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+int
+perform_test(glfs_t *fs, char *file1, char *file2)
+{
+    int ret = 0;
+    glfs_fd_t *fd1 = NULL;
+    glfs_fd_t *fd2 = NULL;
+    char *buf = "0123456789";
+
+    fd1 = open_file(fs, file1);
+    if (!fd1) {
+        ret = -1;
+        goto out;
+    }
+    fd2 = open_file(fs, file2);
+    if (!fd2) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Kill one brick from the .t.*/
+    pause();
+
+    ret = acquire_mandatory_lock(fs, fd1);
+    if (ret) {
+        goto out;
+    }
+    ret = acquire_mandatory_lock(fs, fd2);
+    if (ret) {
+        goto out;
+    }
+
+    /* Bring the brick up and let the locks heal. */
+    pause();
+    /*At this point, the .t would have killed and brought back 2 bricks,
+     * marking the fd bad.*/
+
+    ret = glfs_write(fd1, buf, 10, 0);
+    if (ret > 0) {
+        /* Write is supposed to fail with EBADFD*/
+        LOG_ERR("glfs_write", ret);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (fd1)
+        glfs_close(fd1);
+    if (fd2)
+        glfs_close(fd2);
+    return ret;
+}
+
+static void
+sigusr1_handler(int signo)
+{
+    /*Signal caught. Just continue with the execution.*/
+}
+
+int
+main(int argc, char *argv[])
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+    char *volname = NULL;
+    char log_file[100];
+    char *hostname = NULL;
+    char *fname1 = NULL;
+    char *fname2 = NULL;
+
+    if (argc != 7) {
+        fprintf(stderr,
+                "Expect following args %s <host> <volname> <file1> <file2> "
+                "<log-dir> <log-file-suffix>\n",
+                argv[0]);
+        return -1;
+    }
+
+    hostname = argv[1];
+    volname = argv[2];
+    fname1 = argv[3];
+    fname2 = argv[4];
+
+    /*Use SIGUSR1 and pause() as a means of hitting break-points in this
+     *program when signalled from the .t test case.*/
+    if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) {
+        LOG_ERR("SIGUSR1 handler error", errno);
+        exit(EXIT_FAILURE);
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[5], "lock-heal.c", argv[6], "log");
+    logfile_fp = fopen(log_file, "w");
+    if (!logfile_fp) {
+        fprintf(stderr, "\nfailed to open %s\n", log_file);
+        fflush(stderr);
+        return -1;
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[5], "glfs-client", argv[6], "log");
+    fs = setup_client(hostname, volname, log_file);
+    if (!fs) {
+        LOG_ERR("setup_client", errno);
+        return -1;
+    }
+
+    ret = perform_test(fs, fname1, fname2);
+
+error:
+    if (fs) {
+        /*glfs_fini(fs)*/; // glfs fini path is racy and crashes the program
+    }
+
+    fclose(logfile_fp);
+
+    return ret;
+}
diff --git a/tests/basic/fencing/afr-lock-heal-advanced.t b/tests/basic/fencing/afr-lock-heal-advanced.t
new file mode 100644
index 00000000000..8a7a208db29
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-advanced.t
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+function is_gfapi_program_alive()
+{
+    pid=$1
+    ps -p $pid
+    if [ $?
-eq 0 ] + then + echo "Y" + else + echo "N" + fi +} + +function get_active_lock_count { + brick=$1 + sdump=$(generate_brick_statedump $V0 $H0 $brick) + lock_count="$(grep ACTIVE $sdump| wc -l)" + echo "$lock_count" +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +EXPECT 'Created' volinfo_field $V0 'Status'; +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.open-behind off +TEST $CLI volume set $V0 locks.mandatory-locking forced +TEST $CLI volume set $V0 enforce-mandatory-lock on +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +logdir=`gluster --print-logdir` +TEST build_tester $(dirname $0)/afr-lock-heal-advanced.c -lgfapi -ggdb + +#------------------------------------------------------------------------------ +# Use more than 1 fd from same client so that list_for_each_* loops are executed more than once. +$(dirname $0)/afr-lock-heal-advanced $H0 $V0 "/FILE1" "/FILE2" $logdir C1& +client_pid=$! +TEST [ $client_pid ] + +TEST sleep 5 # By now, the client would have opened an fd on FILE1 and FILE2 and waiting for a SIGUSR1. +EXPECT "Y" is_gfapi_program_alive $client_pid + +# Kill brick-3 and let client-1 take lock on both files. +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST kill -SIGUSR1 $client_pid +# If program is still alive, glfs_file_lock() was a success. +EXPECT "Y" is_gfapi_program_alive $client_pid + +# Check lock is present on brick-1 and brick-2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}1 + +# Restart brick-3 and check that the lock has healed on it. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}2 + +#------------------------------------------------------------------------------ +# Kill same brick before heal completes the first time and check it completes the second time. +TEST $CLI volume set $V0 delay-gen locks +TEST $CLI volume set $V0 delay-gen.delay-duration 5000000 +TEST $CLI volume set $V0 delay-gen.delay-percentage 100 +TEST $CLI volume set $V0 delay-gen.enable finodelk + +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST $CLI volume reset $V0 delay-gen +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0 + +#------------------------------------------------------------------------------ +# Kill 2 bricks and bring it back. The fds must be marked bad. +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 + +# TODO: `gluster v statedump $V0 client localhost:$client_pid` is not working, +# so sleep for 20 seconds for the client to connect to connect to the bricks. +TEST sleep $CHILD_UP_TIMEOUT + +# Try to write to FILE1 from the .c; it must fail. +TEST kill -SIGUSR1 $client_pid +wait $client_pid +ret=$? 
+TEST [ $ret == 0 ]
+
+cleanup_tester $(dirname $0)/afr-lock-heal-advanced
+cleanup;
diff --git a/tests/basic/fencing/afr-lock-heal-basic.c b/tests/basic/fencing/afr-lock-heal-basic.c
new file mode 100644
index 00000000000..768c9e57181
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-basic.c
@@ -0,0 +1,182 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <glusterfs/api/glfs.h>
+
+#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"
+
+FILE *logfile_fp;
+
+#define LOG_ERR(func, err)                                                   \
+    do {                                                                     \
+        if (!logfile_fp) {                                                   \
+            fprintf(stderr, "\n%d %s : returned error (%s)\n", __LINE__,     \
+                    func, strerror(err));                                    \
+            fflush(stderr);                                                  \
+        } else {                                                             \
+            fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__, \
+                    func, strerror(err));                                    \
+            fflush(logfile_fp);                                              \
+        }                                                                    \
+    } while (0)
+
+glfs_t *
+setup_client(char *hostname, char *volname, char *log_file)
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+
+    fs = glfs_new(volname);
+    if (!fs) {
+        fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n",
+                strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_set_logging(fs, log_file, 7);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n",
+                ret, strerror(errno));
+        goto error;
+    }
+
+    ret = glfs_init(fs);
+    if (ret < 0) {
+        fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret,
+                strerror(errno));
+        goto error;
+    }
+
+out:
+    return fs;
+error:
+    return NULL;
+}
+
+int
+acquire_mandatory_lock(glfs_t *fs, char *fname)
+{
+    struct flock lock;
+    int ret = 0;
+    glfs_fd_t *fd = NULL;
+
+    fd = glfs_creat(fs, fname, O_CREAT, 0644);
+    if (!fd) {
+        if (errno != EEXIST) {
+            LOG_ERR("glfs_creat", errno);
+            ret = -1;
+            goto out;
+        }
+        fd = glfs_open(fs, fname, O_RDWR | O_NONBLOCK);
+        if (!fd) {
+            LOG_ERR("glfs_open", errno);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* initialize lock */
+    lock.l_type = F_WRLCK;
+    lock.l_whence = SEEK_SET;
+    lock.l_start = 0;
+    lock.l_len = 100;
+
+    ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0);
+    if (ret < 0) {
+        LOG_ERR("glfs_fsetxattr", errno);
+        ret = -1;
+        goto out;
+    }
+
+    pause();
+
+    /* take a write mandatory lock */
+    ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY);
+    if (ret) {
+        LOG_ERR("glfs_file_lock", errno);
+        goto out;
+    }
+
+    pause();
+
+out:
+    if (fd) {
+        glfs_close(fd);
+    }
+
+    return ret;
+}
+
+static void
+sigusr1_handler(int signo)
+{
+    /*Signal caught. Just continue with the execution.*/
+}
+
+int
+main(int argc, char *argv[])
+{
+    int ret = 0;
+    glfs_t *fs = NULL;
+    char *volname = NULL;
+    char log_file[100];
+    char *hostname = NULL;
+    char *fname = NULL;
+
+    if (argc != 6) {
+        fprintf(stderr,
+                "Expect following args %s <host> <volname> <file> <log-dir> "
+                "<log-file-suffix>\n",
+                argv[0]);
+        return -1;
+    }
+
+    hostname = argv[1];
+    volname = argv[2];
+    fname = argv[3];
+
+    /*Use SIGUSR1 and pause() as a means of hitting break-points in this
+     *program when signalled from the .t test case.*/
+    if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) {
+        LOG_ERR("SIGUSR1 handler error", errno);
+        exit(EXIT_FAILURE);
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[4], "lock-heal-basic.c", argv[5],
+            "log");
+    logfile_fp = fopen(log_file, "w");
+    if (!logfile_fp) {
+        fprintf(stderr, "\nfailed to open %s\n", log_file);
+        fflush(stderr);
+        return -1;
+    }
+
+    sprintf(log_file, "%s/%s.%s.%s", argv[4], "glfs-client", argv[5], "log");
+    fs = setup_client(hostname, volname, log_file);
+    if (!fs) {
+        LOG_ERR("setup_client", errno);
+        return -1;
+    }
+
+    ret = acquire_mandatory_lock(fs, fname);
+
+error:
+    if (fs) {
+        /*glfs_fini(fs)*/; // glfs fini path is racy and crashes the program
+    }
+
+    fclose(logfile_fp);
+
+    return ret;
+}
diff --git a/tests/basic/fencing/afr-lock-heal-basic.t b/tests/basic/fencing/afr-lock-heal-basic.t
new file mode 100644
index 00000000000..5ac05c7aec6
--- /dev/null
+++ b/tests/basic/fencing/afr-lock-heal-basic.t
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+function is_gfapi_program_alive()
+{
+    pid=$1
+    ps -p $pid
+    if [ $? -eq 0 ]
+    then
+        echo "Y"
+    else
+        echo "N"
+    fi
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+EXPECT 'Created' volinfo_field $V0 'Status';
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.open-behind off
+TEST $CLI volume set $V0 locks.mandatory-locking forced
+TEST $CLI volume set $V0 enforce-mandatory-lock on
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+logdir=`gluster --print-logdir`
+TEST build_tester $(dirname $0)/afr-lock-heal-basic.c -lgfapi -ggdb
+
+$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C1&
+client1_pid=$!
+TEST [ $client1_pid ]
+
+$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C2&
+client2_pid=$!
+TEST [ $client2_pid ]
+
+TEST sleep 5 # By now, the 2 clients would have opened an fd on FILE and waiting for a SIGUSR1.
+EXPECT "Y" is_gfapi_program_alive $client1_pid
+EXPECT "Y" is_gfapi_program_alive $client2_pid
+
+# Kill brick-3 and let client-1 take lock on the file.
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST kill -SIGUSR1 $client1_pid
+# If program is still alive, glfs_file_lock() was a success.
+EXPECT "Y" is_gfapi_program_alive $client1_pid
+
+# Check lock is present on brick-1 and brick-2
+b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0)
+b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1)
+c1_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+c1_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')"
+TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b2" ]
+
+# Restart brick-3 and check that the lock has healed on it.
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+TEST sleep 10 #Needed for client to re-open fd?
Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal. + +b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2) +c1_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b3" ] + +# Kill brick-1 and let client-2 preempt the lock on bricks 2 and 3. +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill -SIGUSR1 $client2_pid +# If program is still alive, glfs_file_lock() was a success. +EXPECT "Y" is_gfapi_program_alive $client2_pid + +# Restart brick-1 and let lock healing complete. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal. + +# Check that all bricks now have locks from client 2 only. +b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0) +b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1) +b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2) +c2_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +c2_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +c2_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b2" ] +TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b3" ] +TEST [ "$c2_lock_on_b1" != "$c1_lock_on_b1" ] + +#Let the client programs run and exit. +TEST kill -SIGUSR1 $client1_pid +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client1_pid +TEST kill -SIGUSR1 $client2_pid +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client2_pid + +cleanup_tester $(dirname $0)/afr-lock-heal-basic +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 4b22af7cb3f..07bf53a1941 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,6 +45,21 @@ afr_quorum_errno(afr_private_t *priv) return ENOTCONN; } +static void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) +{ + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + replies[i] = 1; + } else { + replies[i] = 0; + } + } +} + int afr_fav_child_reset_sink_xattrs(void *opaque); @@ -54,6 +69,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque); static void afr_discover_done(call_frame_t *frame, xlator_t *this); +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + local->cont.lk.dom_lock_op_ret[i] = op_ret; + local->cont.lk.dom_lock_op_errno[i] = op_errno; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to acquire %s on %s", + uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, + priv->children[i]->name); + } else { + local->cont.lk.dom_locked_nodes[i] = 1; + } + + syncbarrier_wake(&local->barrier); + + return 0; +} + +int +afr_dom_lock_acquire(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + int i = 0; + + priv = frame->this->private; + local = frame->local; + local->cont.lk.dom_locked_nodes = GF_CALLOC( + priv->child_count, 
sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + if (!local->cont.lk.dom_locked_nodes) { + return -ENOMEM; + } + local->cont.lk.dom_lock_op_ret = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_ret) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + local->cont.lk.dom_lock_op_errno = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_errno) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + flock.l_type = F_WRLCK; + + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLK, &flock, NULL); + + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) + goto blocking_lock; + + /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ + if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != + priv->child_count) { + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.dom_lock_op_ret[i] == -1 && + local->cont.lk.dom_lock_op_errno[i] == EAGAIN) + goto blocking_lock; + } + } + + return 0; + +blocking_lock: + afr_dom_lock_release(frame); + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLKW, &flock, NULL); + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { + afr_dom_lock_release(frame); + return -afr_quorum_errno(priv); + } + + return 0; +} + +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to release %s on %s", local->loc.path, + AFR_LK_HEAL_DOM, priv->children[i]->name); + } + local->cont.lk.dom_locked_nodes[i] = 0; + + syncbarrier_wake(&local->barrier); + + return 0; +} + +void +afr_dom_lock_release(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + priv = frame->this->private; + locked_on = local->cont.lk.dom_locked_nodes; + if (AFR_COUNT(locked_on, priv->child_count) == 0) + return; + flock.l_type = F_UNLCK; + + AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, + AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); + + return; +} + +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) +{ + if (!info) + return; + if (info->xdata_req) + dict_unref(info->xdata_req); + if (info->fd) + fd_unref(info->fd); + GF_FREE(info->locked_nodes); + GF_FREE(info->child_up_event_gen); + GF_FREE(info->child_down_event_gen); + GF_FREE(info); +} + +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -ENOMEM; + + info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); + if (!info) { + goto cleanup; + } + INIT_LIST_HEAD(&info->pos); + info->fd = fd_ref(local->fd); + info->cmd = local->cont.lk.cmd; + info->pid = frame->root->pid; + info->flock = local->cont.lk.user_flock; + info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); + if (!info->xdata_req) { + goto cleanup; + } + info->lk_owner = frame->root->lk_owner; + 
info->locked_nodes = GF_MALLOC( + sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); + if (!info->locked_nodes) { + goto cleanup; + } + memcpy(info->locked_nodes, local->cont.lk.locked_nodes, + sizeof(*info->locked_nodes) * priv->child_count); + info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), + priv->child_count, gf_afr_mt_int32_t); + if (!info->child_up_event_gen) { + goto cleanup; + } + info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), + priv->child_count, + gf_afr_mt_int32_t); + if (!info->child_down_event_gen) { + goto cleanup; + } + + LOCK(&local->fd->lock); + { + fd_ctx = __afr_fd_ctx_get(local->fd, this); + if (fd_ctx) + fd_ctx->lk_heal_info = info; + } + UNLOCK(&local->fd->lock); + if (!fd_ctx) { + goto cleanup; + } + + LOCK(&priv->lock); + { + list_add_tail(&info->pos, &priv->saved_locks); + } + UNLOCK(&priv->lock); + + return 0; +cleanup: + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to add lock to healq", + uuid_utoa(local->fd->inode->gfid)); + if (info) { + afr_lk_heal_info_cleanup(info); + if (fd_ctx) { + LOCK(&local->fd->lock); + { + fd_ctx->lk_heal_info = NULL; + } + UNLOCK(&local->fd->lock); + } + } + return ret; +} + +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = local->cont.lk.user_flock; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -EINVAL; + + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx || !fd_ctx->lk_heal_info) { + goto out; + } + + info = fd_ctx->lk_heal_info; + if ((info->flock.l_start != flock.l_start) || + (info->flock.l_whence != flock.l_whence) || + (info->flock.l_len != flock.l_len)) { + /*TODO: Compare lkowners too.*/ + goto out; + } + + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + UNLOCK(&priv->lock); + + afr_lk_heal_info_cleanup(info); + fd_ctx->lk_heal_info = NULL; + ret = 0; +out: + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to remove lock from healq", + uuid_utoa(local->fd->inode->gfid)); + return ret; +} + +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed to heal lock on child %d for %s", i, + uuid_utoa(local->fd->inode->gfid)); + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); + } else { + local->cont.lk.getlk_rsp[i] = *lock; + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + afr_local_t *local = frame->local; + struct gf_flock flock = { + 0, + }; + gf_boolean_t ret = _gf_true; 
+ char *wind_on = alloca0(priv->child_count); + unsigned char *success_replies = alloca0(priv->child_count); + local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), + priv->child_count, gf_afr_mt_gf_lock); + + flock = info->flock; + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + wind_on[i] = 1; + } + + AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, + info->xdata_req); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) == 0) { + ret = _gf_false; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret != 0) + continue; + if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) + continue; + /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/ + if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, + &info->lk_owner)) { + ret = _gf_false; + break; + } + } +out: + afr_local_replies_wipe(local, priv); + GF_FREE(local->cont.lk.getlk_rsp); + local->cont.lk.getlk_rsp = NULL; + return ret; +} + +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) + return; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + if (fd_ctx) { + fd_ctx->is_fd_bad = _gf_true; + fd_ctx->lk_heal_info = NULL; + } + } + UNLOCK(&fd->lock); +} + +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) +{ + LOCK(&priv->lock); + { + list_del(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + UNLOCK(&priv->lock); +} + +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + int op_errno = 0; + int32_t *current_event_gen = NULL; + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + char *wind_on = alloca0(priv->child_count); + gf_boolean_t retry = _gf_true; + + frame->root->pid = info->pid; + lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + + op_errno = -afr_dom_lock_acquire(frame); + if ((op_errno != 0)) { + goto release; + } + + if (!afr_does_lk_owner_match(frame, priv, info)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, + "Ignoring lock heal for %s since lk-onwers mismatch. " + "Lock possibly pre-empted by another client.", + uuid_utoa(info->fd->inode->gfid)); + goto release; + } + + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + continue; + wind_on[i] = 1; + } + + current_event_gen = alloca(priv->child_count); + memcpy(current_event_gen, info->child_up_event_gen, + priv->child_count * sizeof *current_event_gen); + AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, + &info->flock, info->xdata_req); + + LOCK(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { + continue; + } + + if ((current_event_gen[i] == info->child_up_event_gen[i]) && + (current_event_gen[i] > info->child_down_event_gen[i])) { + info->locked_nodes[i] = 1; + retry = _gf_false; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->saved_locks); + } else { + /*We received subsequent child up/down events while heal was in + * progress; don't mark child as healed. 
Attempt again on the + * new child up*/ + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, + "Event gen mismatch: skipped healing lock on child %d " + "for %s.", + i, uuid_utoa(info->fd->inode->gfid)); + } + } + } + UNLOCK(&priv->lock); + +release: + afr_dom_lock_release(frame); + if (retry) + afr_add_lock_to_lkhealq(priv, info); + return; +} + +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + STACK_DESTROY(frame->root); + return 0; +} + +static int +afr_lock_heal(void *opaque) +{ + call_frame_t *frame = (call_frame_t *)opaque; + call_frame_t *iter_frame = NULL; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + struct list_head healq = { + 0, + }; + int ret = 0; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + return ENOMEM; + } + + INIT_LIST_HEAD(&healq); + LOCK(&priv->lock); + { + list_splice_init(&priv->lk_healq, &healq); + } + UNLOCK(&priv->lock); + + list_for_each_entry_safe(info, tmp, &healq, pos) + { + GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < + priv->child_count)); + ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); + afr_lock_heal_do(iter_frame, priv, info); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = ENOTCONN; + gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, + AFR_MSG_LK_HEAL_DOM, + "Aborting processing of lk_healq." + "Healing will be reattempted on next child up for locks " + "that are still in quorum."); + LOCK(&priv->lock); + { + list_add_tail(&healq, &priv->lk_healq); + } + UNLOCK(&priv->lock); + break; + } + } + + AFR_STACK_DESTROY(iter_frame); + return ret; +} + +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_up_event_gen[child] = priv->event_generation; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) + return -1; + + ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, + frame); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, + "Failed to launch lock heal synctask"); + + return ret; +} + +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) +{ + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_down_event_gen[child] = priv->event_generation; + if (info->locked_nodes[child] == 1) + info->locked_nodes[child] = 0; + if (!afr_has_quorum(info->locked_nodes, this, NULL)) { + /* Since the lock was lost on quorum no. of nodes, we should + * not attempt to heal it anymore. Some other client could have + * acquired the lock, modified data and released it and this + * client wouldn't know about it if we heal it.*/ + afr_mark_fd_bad(info->fd, this); + list_del(&info->pos); + afr_lk_heal_info_cleanup(info); + /* We're not winding an unlock on the node where the lock is still + * present because when fencing logic switches over to the new + * client (since we marked the fd bad), it should preempt any + * existing lock. 
*/ + } + } + return 0; +} + gf_boolean_t afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) @@ -68,6 +658,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, return _gf_true; } +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) +{ + int ret = 0; + uint32_t lk_mode = GF_LK_ADVISORY; + + ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); + if (!ret && lk_mode == GF_LK_MANDATORY) + return _gf_true; + + return _gf_false; +} + call_frame_t * afr_copy_frame(call_frame_t *base) { @@ -1224,18 +1827,6 @@ refresh_done: return 0; } -static void -afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, - unsigned char *replies) -{ - int i = 0; - - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].valid && local->replies[i].op_ret == 0) - replies[i] = 1; - } -} - int afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) { @@ -2049,6 +2640,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this) { /* lk */ GF_FREE(local->cont.lk.locked_nodes); + GF_FREE(local->cont.lk.dom_locked_nodes); + GF_FREE(local->cont.lk.dom_lock_op_ret); + GF_FREE(local->cont.lk.dom_lock_op_errno); } { /* create */ @@ -3451,8 +4045,18 @@ out: } void -_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx) +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx) { + afr_private_t *priv = this->private; + + if (fd_ctx->lk_heal_info) { + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); + fd_ctx->lk_heal_info = NULL; + } GF_FREE(fd_ctx->opened_on); GF_FREE(fd_ctx); return; @@ -3472,7 +4076,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long)ctx; if (fd_ctx) { - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); } out: @@ -3565,13 +4169,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd) } fd_ctx->readdir_subvol = -1; + fd_ctx->lk_heal_info = NULL; ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx); if (ret) gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd); out: if (ret && fd_ctx) - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); return ret; } @@ -3694,6 +4299,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) call_stub_t *stub = NULL; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -4230,9 +4836,9 @@ out: } static int32_t -afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4244,8 +4850,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (flock->l_type != F_UNLCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.inodelk.volume = gf_strdup(volume); if (!local->cont.inodelk.volume) { @@ -4274,8 +4882,8 @@ int32_t afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, 
GF_FOP_INODELK, volume, loc, NULL, cmd, + flock, xdata); return 0; } @@ -4283,15 +4891,16 @@ int32_t afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, + flock, xdata); return 0; } static int -afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4303,8 +4912,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (cmd != ENTRYLK_UNLOCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.entrylk.cmd = cmd; local->cont.entrylk.in_cmd = cmd; local->cont.entrylk.type = type; @@ -4331,8 +4942,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, + cmd, type, xdata); return 0; } @@ -4341,8 +4952,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, + cmd, type, xdata); return 0; } @@ -4460,9 +5071,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } call_count = afr_frame_return(frame); - if (call_count == 0) + if (call_count == 0) { AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, local->xdata_rsp); + } return 0; } @@ -4560,12 +5172,134 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, return 0; } +int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} + +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = -1; + + local = frame->local; + child_index = (long)cookie; + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int child_index = (long)cookie; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + 
uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + return 0; +} +int +afr_lk_transaction(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *wind_on = NULL; + int op_errno = 0; + int i = 0; + int ret = 0; + + frame = (call_frame_t *)opaque; + local = frame->local; + this = frame->this; + priv = this->private; + wind_on = alloca0(priv->child_count); + + if (priv->arbiter_count || priv->child_count != 3) { + op_errno = ENOTSUP; + gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Lock healing supported only for replica 3 volumes.", + uuid_utoa(local->fd->inode->gfid)); + goto err; + } + + op_errno = -afr_dom_lock_acquire(frame); // Released during + // AFR_STACK_UNWIND + if (op_errno != 0) { + goto err; + } + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { + op_errno = afr_final_errno(local, priv); + goto err; + } + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) + wind_on[i] = 1; + } + AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + goto unlock; + } else { + if (local->cont.lk.user_flock.l_type == F_UNLCK) + ret = afr_remove_lock_from_saved_locks(local, this); + else + ret = afr_add_lock_to_saved_locks(frame, this); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; + } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } + + return 0; + +unlock: + local->cont.lk.user_flock.l_type = F_UNLCK; + AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return -1; +} + int afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + int ret = 0; int i = 0; int32_t op_errno = ENOMEM; @@ -4576,9 +5310,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, goto out; local->op = GF_FOP_LK; - if (!afr_lk_is_unlock(cmd, flock) && - !afr_is_consistent_io_possible(local, priv, &op_errno)) - goto out; + if (!afr_lk_is_unlock(cmd, flock)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + } local->cont.lk.locked_nodes = GF_CALLOC( priv->child_count, sizeof(*local->cont.lk.locked_nodes), @@ -4596,6 +5332,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, if (xdata) local->xdata_req = dict_ref(xdata); + if (afr_is_lock_mode_mandatory(xdata)) { + ret = synctask_new(this->ctx->env, afr_lk_transaction, + afr_lk_transaction_cbk, frame, frame); + if (ret) { + op_errno = ENOMEM; + goto out; + } + return 0; + } + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i], priv->children[i]->fops->lk, fd, cmd, flock, local->xdata_req); @@ -5593,6 +6339,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) __afr_handle_child_up_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + 
__afr_lock_heal_synctask(this, priv, idx); break; case GF_EVENT_CHILD_DOWN: @@ -5606,6 +6353,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) __afr_handle_child_down_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + __afr_mark_pending_lk_heal(this, priv, idx); break; case GF_EVENT_CHILD_CONNECTING: diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index e8894a62620..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -302,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) afr_local_t *local = NULL; int op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1698,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, int32_t op_errno = 0; fop_fgetxattr_cbk_t cbk = NULL; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1791,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1866,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 9acb4d0e053..a3d2150efe2 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -491,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int op_errno = ENOMEM; int ret = -1; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -730,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -940,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1690,6 +1693,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1898,6 +1902,7 @@ afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1998,6 +2003,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2107,6 +2113,7 @@ 
afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2213,6 +2220,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2412,6 +2420,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2507,6 +2516,7 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, int ret = -1; int32_t op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index b0fb00641a0..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -31,6 +31,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_empty_brick_t, gf_afr_mt_child_latency_t, gf_afr_mt_atomic_t, + gf_afr_mt_lk_heal_info_t, + gf_afr_mt_gf_lock, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index c9c99270e98..8e59c51b993 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -42,6 +42,6 @@ GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED, - AFR_MSG_THIN_ARB); + AFR_MSG_THIN_ARB, AFR_MSG_LK_HEAL_DOM); #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index f8db3c5653f..13b5ca2fce9 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -417,6 +417,8 @@ init(xlator_t *this) goto out; priv = this->private; + INIT_LIST_HEAD(&priv->saved_locks); + INIT_LIST_HEAD(&priv->lk_healq); LOCK_INIT(&priv->lock); child_count = xlator_subvolume_count(this); @@ -684,6 +686,7 @@ fini(xlator_t *this) priv = this->private; afr_selfheal_daemon_fini(this); + GF_ASSERT(list_empty(&priv->saved_locks)); LOCK(&priv->lock); if (priv->timer != NULL) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index f86f019e637..28be839ad68 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -39,6 +39,8 @@ #define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify" #define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" +#define AFR_LK_HEAL_DOM "afr.lock-heal.domain" + #define AFR_HALO_MAX_LATENCY 99999 #define PFLAG_PENDING (1 << 0) @@ -95,6 +97,16 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this); gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \ } while (0) +#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \ + do { \ + afr_fd_ctx_t *__fd_ctx = NULL; \ + __fd_ctx = afr_fd_ctx_get(__fd, __this); \ + if (__fd_ctx && __fd_ctx->is_fd_bad) { \ + __error = EBADF; \ + goto __label; \ + } \ + } while (0) + typedef enum { AFR_READ_POLICY_FIRST_UP, AFR_READ_POLICY_GFID_HASH, @@ -143,6 +155,19 @@ struct afr_nfsd { gf_boolean_t iamnfsd; 
}; +typedef struct _afr_lk_heal_info { + fd_t *fd; + int32_t cmd; + struct gf_flock flock; + dict_t *xdata_req; + unsigned char *locked_nodes; + struct list_head pos; + gf_lkowner_t lk_owner; + pid_t pid; + int32_t *child_up_event_gen; + int32_t *child_down_event_gen; +} afr_lk_heal_info_t; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -249,6 +274,10 @@ typedef struct _afr_private { gf_boolean_t esh_granular; gf_boolean_t consistent_io; gf_boolean_t data_self_heal; /* on/off */ + + /*For lock healing.*/ + struct list_head saved_locks; + struct list_head lk_healq; } afr_private_t; typedef enum { @@ -371,6 +400,10 @@ typedef struct { arrives, we continue to read off this subvol. */ int readdir_subvol; + /* lock-healing related members. */ + gf_boolean_t is_fd_bad; + afr_lk_heal_info_t *lk_heal_info; + } afr_fd_ctx_t; typedef enum { @@ -572,6 +605,11 @@ typedef struct _afr_local { struct gf_flock ret_flock; unsigned char *locked_nodes; int32_t cmd; + /*For lock healing only.*/ + unsigned char *dom_locked_nodes; + int32_t *dom_lock_op_ret; + int32_t *dom_lock_op_errno; + struct gf_flock *getlk_rsp; } lk; /* inode read */ @@ -1074,6 +1112,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); if (__local && __local->is_read_txn) \ afr_pending_read_decrement(__this->private, \ __local->read_subvol); \ + if (__local && afr_is_lock_mode_mandatory(__local->xdata_req)) \ + afr_dom_lock_release(frame); \ frame->local = NULL; \ } \ \ @@ -1354,4 +1394,10 @@ afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, void afr_selfheal_childup(xlator_t *this, afr_private_t *priv); + +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata); + +void +afr_dom_lock_release(call_frame_t *frame); #endif /* __AFR_H__ */ -- cgit
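
For reviewers trying the feature by hand, below is a minimal gfapi sketch of the client-side flow whose locks this patch heals: enable mandatory-lock enforcement on the fd via the trusted.glusterfs.enforce-mandatory-lock xattr, then take a mandatory write lock with glfs_file_lock(). It only uses calls already exercised by the tests in this patch; the host, volume, and file names are placeholders, and error handling is trimmed.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <glusterfs/api/glfs.h>

#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock"

int
main(void)
{
    struct flock lock = {0};
    glfs_t *fs = NULL;
    glfs_fd_t *fd = NULL;
    int ret = -1;

    /* Placeholder host/volume/file names. */
    fs = glfs_new("testvol");
    if (!fs)
        return -1;
    glfs_set_volfile_server(fs, "tcp", "server1", 24007);
    if (glfs_init(fs) < 0)
        goto out;

    fd = glfs_creat(fs, "/FILE", O_CREAT, 0644);
    if (!fd)
        goto out;

    /* Ask the locks xlator to enforce mandatory locking on this fd. */
    if (glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0) < 0) {
        fprintf(stderr, "fsetxattr: %s\n", strerror(errno));
        goto out;
    }

    /* Byte-range write lock. AFR records it in priv->saved_locks and
     * re-acquires it on a down brick when that child comes back up. */
    lock.l_type = F_WRLCK;
    lock.l_whence = SEEK_SET;
    lock.l_start = 0;
    lock.l_len = 100;
    if (glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY) != 0) {
        fprintf(stderr, "file_lock: %s\n", strerror(errno));
        goto out;
    }

    /* ... I/O under the lock. If the lock is lost on quorum no. of bricks,
     * the fd is marked bad and further fd-based fops fail. ... */
    ret = 0;
out:
    if (fd)
        glfs_close(fd);
    /* glfs_fini() skipped, mirroring the tests above. */
    return ret;
}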