From 66f560e0071db84d430f38b996364f6b8c4f0f6d Mon Sep 17 00:00:00 2001 From: Rajesh Joseph Date: Wed, 23 Apr 2014 18:36:34 +0530 Subject: glusterd/snapshot: Restore cleanup If restore fails for some reason then we should revert the restore operation. To do so we take the backup of vols folder before doing a restore and if the restore fails then we revert the changes done. Change-Id: I97f72aec3a34fc122bf137beb336e94db3a04dff BUG: 1061685 Signed-off-by: Rajesh Joseph Reviewed-on: http://review.gluster.org/7548 Reviewed-by: Santosh Pradhan Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/mgmt/glusterd/src/glusterd-snapshot.c | 453 ++++++++++++++++++++++++-- xlators/mgmt/glusterd/src/glusterd-store.c | 15 +- xlators/mgmt/glusterd/src/glusterd-store.h | 2 + xlators/mgmt/glusterd/src/glusterd-utils.c | 71 ++++ xlators/mgmt/glusterd/src/glusterd-utils.h | 3 + xlators/mgmt/glusterd/src/glusterd.h | 3 + 6 files changed, 518 insertions(+), 29 deletions(-) (limited to 'xlators/mgmt/glusterd/src') diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c index 73723422bc7..399401d187a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c +++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c @@ -410,6 +410,101 @@ out: return ret; } +/* This function will take backup of the volume store + * of the to-be restored volume. This will help us to + * revert the operation if it fails. 
+ * + * @param volinfo volinfo of the origin volume + * + * @return 0 on success and -1 on failure + */ +int +glusterd_snapshot_backup_vol (glusterd_volinfo_t *volinfo) +{ + char pathname[PATH_MAX] = {0,}; + int ret = -1; + int op_ret = 0; + char delete_path[PATH_MAX] = {0,}; + char trashdir[PATH_MAX] = {0,}; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (volinfo); + + GLUSTERD_GET_VOLUME_DIR (pathname, volinfo, priv); + + snprintf (delete_path, sizeof (delete_path), + "%s/"GLUSTERD_TRASH"/vols-%s.deleted", priv->workdir, + volinfo->volname); + + snprintf (trashdir, sizeof (trashdir), "%s/"GLUSTERD_TRASH, + priv->workdir); + + /* Create trash folder if it is not there */ + ret = mkdir (trashdir, 0777); + if (ret && errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create trash " + "directory, reason : %s", strerror (errno)); + ret = -1; + goto out; + } + + /* Move the origin volume folder to the backup location */ + ret = rename (pathname, delete_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to rename snap " + "directory %s to %s", pathname, delete_path); + goto out; + } + + /* Re-create an empty origin volume folder so that restore can + * happen. 
*/ + ret = mkdir (pathname, 0777); + if (ret && errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create origin " + "volume directory (%s), reason : %s", + pathname, strerror (errno)); + ret = -1; + goto out; + } + + ret = 0; +out: + /* Save the actual return value */ + op_ret = ret; + if (ret) { + /* Revert the changes in case of failure */ + ret = rmdir (pathname); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Failed to rmdir: %s,err: %s", + pathname, strerror (errno)); + } + + ret = rename (delete_path, pathname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to rename directory %s to %s", + delete_path, pathname); + } + + ret = rmdir (trashdir); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Failed to rmdir: %s, Reason: %s", + trashdir, strerror (errno)); + } + } + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", op_ret); + + return op_ret; +} + int32_t glusterd_copy_geo_rep_files (glusterd_volinfo_t *origin_vol, glusterd_volinfo_t *snap_vol, dict_t *rsp_dict) @@ -681,6 +776,15 @@ glusterd_snapshot_restore_prevalidate (dict_t *dict, char **op_errstr, ret = -1; goto out; } + + /* Take backup of the volinfo folder */ + ret = glusterd_snapshot_backup_vol (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to backup " + "volume backend files for %s volume", + volinfo->volname); + goto out; + } } ret = 0; @@ -5660,6 +5764,286 @@ out: return ret; } +/* This function is called if snapshot restore operation + * is successful. It will cleanup the backup files created + * during the restore operation. 
+ * + * @param rsp_dict Response dictionary + * @param volinfo volinfo of the volume which is being restored + * @param snap snap object + * + * @return 0 on success or -1 on failure + */ +int +glusterd_snapshot_restore_cleanup (dict_t *rsp_dict, + glusterd_volinfo_t *volinfo, + glusterd_snap_t *snap) +{ + int ret = -1; + char delete_path[PATH_MAX] = {0,}; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + + GF_ASSERT (rsp_dict); + GF_ASSERT (volinfo); + GF_ASSERT (snap); + + /* If the volinfo is already restored then we should delete + * the backend LVMs */ + if (!uuid_is_null (volinfo->restored_from_snap)) { + ret = glusterd_lvm_snapshot_remove (rsp_dict, volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to remove " + "LVM backend"); + goto out; + } + } + + snprintf (delete_path, sizeof (delete_path), + "%s/"GLUSTERD_TRASH"/vols-%s.deleted", priv->workdir, + volinfo->volname); + + /* Restore is successful therefore delete the original volume's + * volinfo. + */ + ret = glusterd_volinfo_delete (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to delete volinfo"); + goto out; + } + + /* Now delete the snap entry. */ + ret = glusterd_snap_remove (rsp_dict, snap, _gf_false, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to delete " + "snap %s", snap->snapname); + goto out; + } + + /* Delete the backup copy of volume folder */ + ret = glusterd_recursive_rmdir (delete_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to remove " + "backup dir (%s)", delete_path); + goto out; + } + + ret = 0; +out: + return ret; +} + +/* This function is called when the snapshot restore operation failed + * for some reasons. In such case we revert the restore operation. + * + * @param volinfo volinfo of the origin volume + * @param restore_from_store Boolean variable which tells whether to + * restore the origin from store or not. 
+ * + * @return 0 on success and -1 on failure + */ +int +glusterd_snapshot_revert_partial_restored_vol (glusterd_volinfo_t *volinfo, + gf_boolean_t restore_from_store) +{ + int ret = 0; + char pathname [PATH_MAX] = {0,}; + char trash_path[PATH_MAX] = {0,}; + glusterd_volinfo_t *reverted_vol = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (volinfo); + + GLUSTERD_GET_VOLUME_DIR (pathname, volinfo, priv); + + snprintf (trash_path, sizeof (trash_path), + "%s/"GLUSTERD_TRASH"/vols-%s.deleted", priv->workdir, + volinfo->volname); + + /* Since snapshot restore failed we cannot rely on the volume + * data stored under vols folder. Therefore delete the origin + * volume's backend folder.*/ + ret = glusterd_recursive_rmdir (pathname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to remove " + "%s directory", pathname); + goto out; + } + + /* Now move the backup copy of the vols to its original + * location.*/ + ret = rename (trash_path, pathname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to rename folder " + "from %s to %s", trash_path, pathname); + goto out; + } + + /* Skip the volinfo retrieval from the store if restore_from_store + * is not true. */ + if (!restore_from_store) { + ret = 0; + goto out; + } + + /* Retrieve the volume from the store */ + reverted_vol = glusterd_store_retrieve_volume (volinfo->volname, NULL); + if (NULL == reverted_vol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to load restored " + "%s volume", volinfo->volname); + goto out; + } + + /* Since we retrieved the volinfo from store now we don't + * want the older volinfo. 
Therefore delete the older volinfo */ + ret = glusterd_volinfo_delete (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to delete volinfo"); + goto out; + } + + ret = 0; +out: + return ret; +} + +/* This function is called when glusterd is started and we need + * to revert a failed snapshot restore. + * + * @param snap snapshot object of the restored snap + * + * @return 0 on success and -1 on failure + */ +int +glusterd_snapshot_revert_restore_from_snap (glusterd_snap_t *snap) +{ + int ret = -1; + char volname [PATH_MAX] = {0,}; + glusterd_volinfo_t *snap_volinfo = NULL; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (snap); + + /* TODO : As of now there is only one volume in snapshot. + * Change this when multiple volume snapshot is introduced + */ + snap_volinfo = list_entry (snap->volumes.next, glusterd_volinfo_t, + vol_list); + + strcpy (volname, snap_volinfo->parent_volname); + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not get volinfo of " + "%s", snap_volinfo->parent_volname); + goto out; + } + + ret = glusterd_snapshot_revert_partial_restored_vol (volinfo, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to revert snapshot " + "restore operation for %s volume", volname); + goto out; + } +out: + return ret; +} + +/* This function is called from post-validation. Based on the op_ret + * it will take a decision on whether to revert the operation or + * perform cleanup. 
+ * + * @param dict dictionary object + * @param op_ret return value of the restore operation + * @param op_errstr error string + * @param rsp_dict Response dictionary + * + * @return 0 on success and -1 on failure + */ +int +glusterd_snapshot_restore_postop (dict_t *dict, int32_t op_ret, + char **op_errstr, dict_t *rsp_dict) +{ + int ret = -1; + char *name = NULL; + char *volname = NULL; + glusterd_snap_t *snap = NULL; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + ret = dict_get_str (dict, "snapname", &name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "getting the snap " + "name failed (volume: %s)", volinfo->volname); + goto out; + } + + snap = glusterd_find_snap_by_name (name); + if (!snap) { + gf_log (this->name, GF_LOG_ERROR, "snap %s is not found", name); + ret = -1; + goto out; + } + + /* TODO: fix this when multiple volume support will come */ + ret = dict_get_str (dict, "volname1", &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get volume name"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Volume (%s) does not exist ", volname); + goto out; + } + + /* On success perform the cleanup operation */ + if (0 == op_ret) { + ret = glusterd_snapshot_restore_cleanup (rsp_dict, volinfo, + snap); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to perform " + "snapshot restore cleanup for %s volume", + volname); + goto out; + } + } else { /* On failure revert snapshot restore */ + ret = glusterd_snapshot_revert_partial_restored_vol (volinfo, + _gf_false); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to revert " + "restore operation for %s volume", volname); + goto out; + } + } + + ret = 0; +out: + return ret; +} + int glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr, dict_t *rsp_dict) @@ -5693,6 +6077,15 @@ 
glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr, } break; case GF_SNAP_OPTION_TYPE_DELETE: + ret = glusterd_snapshot_update_snaps_post_validate (dict, + op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "update missed snaps list"); + goto out; + } + break; case GF_SNAP_OPTION_TYPE_RESTORE: ret = glusterd_snapshot_update_snaps_post_validate (dict, op_errstr, @@ -5702,6 +6095,14 @@ glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr, "update missed snaps list"); goto out; } + + ret = glusterd_snapshot_restore_postop (dict, op_ret, + op_errstr, rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "perform snapshot restore post-op"); + goto out; + } break; case GF_SNAP_OPTION_TYPE_ACTIVATE: case GF_SNAP_OPTION_TYPE_DEACTIVATE: @@ -6274,6 +6675,23 @@ gd_restore_snap_volume (dict_t *rsp_dict, snap = snap_vol->snapshot; GF_VALIDATE_OR_GOTO (this->name, snap, out); + /* Set the status to under restore so that if + * the node goes down during restore and comes back + * the state of the volume can be reverted correctly + */ + snap->snap_status = GD_SNAP_STATUS_UNDER_RESTORE; + + /* We need to save this on disk so that if node goes + * down the status is in updated state. + */ + ret = glusterd_store_snap (snap); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not store snap " + "object for %s snap of %s volume", snap_vol->volname, + snap_vol->parent_volname); + goto out; + } + /* Snap volume must be stoped before performing the * restore operation. 
*/ @@ -6312,15 +6730,6 @@ gd_restore_snap_volume (dict_t *rsp_dict, ret = glusterd_snap_volinfo_restore (rsp_dict, new_volinfo, snap_vol); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Failed to restore snap"); - (void)glusterd_volinfo_delete (new_volinfo); - goto out; - } - - ret = glusterd_lvm_snapshot_remove (rsp_dict, orig_vol); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to remove " - "LVM backend"); - (void)glusterd_volinfo_delete (new_volinfo); goto out; } @@ -6344,27 +6753,8 @@ gd_restore_snap_volume (dict_t *rsp_dict, * set the status to the original volume's status. */ glusterd_set_volume_status (new_volinfo, orig_vol->status); - /* Once the new_volinfo is completely constructed then delete - * the orinal volinfo - */ - ret = glusterd_volinfo_delete (orig_vol); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to delete volinfo"); - (void)glusterd_volinfo_delete (new_volinfo); - goto out; - } - list_add_tail (&new_volinfo->vol_list, &conf->volumes); - /* Now delete the snap entry. As a first step delete the snap - * volume information stored in store. */ - ret = glusterd_snap_remove (rsp_dict, snap, _gf_false, _gf_true); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "Failed to delete " - "snap %s", snap->snapname); - goto out; - } - ret = glusterd_store_volinfo (new_volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); if (ret) { @@ -6374,6 +6764,13 @@ gd_restore_snap_volume (dict_t *rsp_dict, ret = 0; out: + if (ret && NULL != new_volinfo) { + /* In case of any failure we should free new_volinfo. Doing + * this will also remove the entry we added in conf->volumes + * if it was added there. 
+ */ + (void)glusterd_volinfo_delete (new_volinfo); + } return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 3993504e8b0..ab635ff943f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -3871,6 +3871,11 @@ out: * dies after taking the backend snapshot, but before updating the * status, then when glusterd comes up, it should treat that snapshot * as a failed snapshot and clean it up. + * + * Restore operation starts by setting the status to + * GD_SNAP_STATUS_RESTORED. If the server goes down before changing + * the status back we need to revert the partial snapshot + * taken. */ int32_t glusterd_snap_cleanup (xlator_t *this) @@ -3893,7 +3898,15 @@ glusterd_snap_cleanup (xlator_t *this) } list_for_each_entry (snap, &priv->snapshots, snap_list) { - if (snap->snap_status != GD_SNAP_STATUS_IN_USE) { + if (snap->snap_status == GD_SNAP_STATUS_RESTORED) { + ret = glusterd_snapshot_revert_restore_from_snap (snap); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to " + "revert partially restored snapshot " + "(%s)", snap->snapname); + goto out; + } + } else if (snap->snap_status != GD_SNAP_STATUS_IN_USE) { ret = glusterd_snap_remove (dict, snap, _gf_true, _gf_true); if (ret) { diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index 63d510cbf17..7fc643ebe8d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -170,4 +170,6 @@ glusterd_store_snap (glusterd_snap_t *snap); int32_t glusterd_store_update_missed_snaps (); +glusterd_volinfo_t* +glusterd_store_retrieve_volume (char *volname, glusterd_snap_t *snap); #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 65aa5e1bf7d..b7f81bf83e5 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ 
b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -11986,3 +11986,74 @@ out: return ret; } + +/* This is an utility function which will recursively delete + * a folder and its contents. + * + * @param delete_path folder to be deleted. + * + * @return 0 on success and -1 on failure. + */ +int +glusterd_recursive_rmdir (const char *delete_path) +{ + int ret = -1; + char path [PATH_MAX] = {0,}; + struct stat st = {0,}; + DIR *dir = NULL; + struct dirent *entry = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_VALIDATE_OR_GOTO (this->name, delete_path, out); + + dir = opendir (delete_path); + if (!dir) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to open directory %s." + " Reason : %s", delete_path, strerror (errno)); + ret = 0; + goto out; + } + + glusterd_for_each_entry (entry, dir); + while (entry) { + snprintf (path, PATH_MAX, "%s/%s", delete_path, entry->d_name); + ret = stat (path, &st); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to stat " + "entry %s : %s", path, strerror (errno)); + goto out; + } + + if (S_ISDIR (st.st_mode)) + ret = glusterd_recursive_rmdir (path); + else + ret = unlink (path); + + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, " Failed to remove " + "%s. Reason : %s", path, strerror (errno)); + } + + gf_log (this->name, GF_LOG_DEBUG, "%s %s", + ret ? "Failed to remove":"Removed", + entry->d_name); + + glusterd_for_each_entry (entry, dir); + } + + ret = closedir (dir); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to close dir %s. 
" + "Reason : %s", delete_path, strerror (errno)); + } + + ret = rmdir (delete_path); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s,err: %s", + delete_path, strerror (errno)); + } +out: + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index 9c0c861830e..e4d41af64c0 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -777,4 +777,7 @@ glusterd_restore_geo_rep_files (glusterd_volinfo_t *snap_vol); int32_t glusterd_copy_quota_files (glusterd_volinfo_t *src_vol, glusterd_volinfo_t *dest_vol); + +int +glusterd_recursive_rmdir (const char *delete_path); #endif diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 2496a4d1182..b7c0aeafb1e 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -378,6 +378,7 @@ typedef enum gd_snap_status_ { GD_SNAP_STATUS_INIT, GD_SNAP_STATUS_IN_USE, GD_SNAP_STATUS_DECOMMISSION, + GD_SNAP_STATUS_UNDER_RESTORE, GD_SNAP_STATUS_RESTORED, } gd_snap_status_t; @@ -1005,4 +1006,6 @@ glusterd_add_new_entry_to_list (char *missed_info, char *snap_vol_id, int32_t brick_num, char *brick_path, int32_t snap_op, int32_t snap_status); +int +glusterd_snapshot_revert_restore_from_snap (glusterd_snap_t *snap); #endif -- cgit