diff options
Diffstat (limited to 'extras/snap_scheduler')
| -rw-r--r-- | extras/snap_scheduler/Makefile.am | 9 | ||||
| -rw-r--r-- | extras/snap_scheduler/README.md | 125 | ||||
| -rw-r--r-- | extras/snap_scheduler/conf.py.in | 11 | ||||
| -rwxr-xr-x | extras/snap_scheduler/gcron.py | 190 | ||||
| -rwxr-xr-x | extras/snap_scheduler/snap_scheduler.py | 941 |
5 files changed, 1276 insertions, 0 deletions
diff --git a/extras/snap_scheduler/Makefile.am b/extras/snap_scheduler/Makefile.am new file mode 100644 index 00000000000..782f139016f --- /dev/null +++ b/extras/snap_scheduler/Makefile.am @@ -0,0 +1,9 @@ +snap_schedulerdir = $(sbindir)/ + +if WITH_SERVER +snap_scheduler_SCRIPTS = gcron.py snap_scheduler.py conf.py +endif + +EXTRA_DIST = gcron.py snap_scheduler.py conf.py + +CLEANFILES = diff --git a/extras/snap_scheduler/README.md b/extras/snap_scheduler/README.md new file mode 100644 index 00000000000..1316bb76469 --- /dev/null +++ b/extras/snap_scheduler/README.md @@ -0,0 +1,125 @@ +Snapshot Scheduler +============================== + +SUMMARY +------- + +GlusterFS volume snapshot provides point-in-time copy of a GlusterFS volume. Currently, GlusterFS volume snapshots can be easily scheduled by setting up cron jobs on one of the nodes in the GlusterFS trusted storage pool. This has a single point of failure (SPOF), as scheduled jobs can be missed if the node running the cron jobs dies. + +We can avoid the SPOF by distributing the cron jobs to all nodes of the trusted storage pool. + +DETAILED DESCRIPTION +-------------------- + +The solution to the above problems involves the usage of: + +* A shared storage - This can be any shared storage (another gluster volume, a NFS mount, etc.) that will be used to share the schedule configuration and will help in the coordination of the jobs. +* An agent - This agent will perform the actual snapshot commands, instead of cron. It will contain the logic to perform coordinated snapshots. +* A helper script - This script will allow the user to initialise the scheduler on the local node, enable/disable scheduling, add/edit/list/delete snapshot schedules. +* cronie - It is the default cron daemon shipped with RHEL. It invokes the agent at the appropriate intervals as mentioned by the user to perform the snapshot operation on the volume as mentioned by the user in the schedule. 
+ +INITIAL SETUP +------------- + +The administrator needs to create a shared storage that can be available to nodes across the cluster. A GlusterFS volume can also be used for the same. It is preferable that the *shared volume* be a replicate volume to avoid SPOF. + +Once the shared storage is created, it should be mounted on all nodes in the trusted storage pool which will be participating in the scheduling. The location where the shared_storage should be mounted (/var/run/gluster/snaps/shared_storage) in these nodes is fixed and is not configurable. Each node participating in the scheduling then needs to perform an initialisation of the snapshot scheduler by invoking the following: + +snap_scheduler.py init + +NOTE: This command needs to be run on all the nodes participating in the scheduling + +HELPER SCRIPT +------------- + +The helper script(snap_scheduler.py) will initialise the scheduler on the local node, enable/disable scheduling, add/edit/list/delete snapshot schedules. + +a) snap_scheduler.py init + +This command initialises the snap_scheduler and interfaces it with the crond running on the local node. This is the first step, before executing any scheduling related commands from a node. + +NOTE: The helper script needs to be run with this option on all the nodes participating in the scheduling. Other options of the helper script can be run independently from any node, where initialisation has been successfully completed. + +b) snap_scheduler.py enable + +The snap scheduler is disabled by default after initialisation. This command enables the snap scheduler. + +c) snap_scheduler.py disable + +This command disables the snap scheduler. + +d) snap_scheduler.py status + +This command displays the current status(Enabled/Disabled) of the snap scheduler. + +e) snap_scheduler.py add "Job Name" "Schedule" "Volume Name" + +This command adds a new snapshot schedule. All the arguments must be provided within double-quotes(""). 
It takes three arguments: + +-> Job Name: This name uniquely identifies this particular schedule, and can be used to reference this schedule for future events like edit/delete. If a schedule already exists for the specified Job Name, the add command will fail. + +-> Schedule: The schedules are accepted in the format crond understands:- + +Example of job definition: +.---------------- minute (0 - 59) +| .------------- hour (0 - 23) +| | .---------- day of month (1 - 31) +| | | .------- month (1 - 12) OR jan,feb,mar,apr ... +| | | | .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat +| | | | | +* * * * * user-name command to be executed +Although we accept all valid cron schedules, currently we support granularity of snapshot schedules to a maximum of half-hourly snapshots. + +-> Volume Name: The name of the volume on which the scheduled snapshot operation will be performed. + +f) snap_scheduler.py edit "Job Name" "Schedule" "Volume Name" + +This command edits an existing snapshot schedule. It takes the same three arguments that the add option takes. All the arguments must be provided within double-quotes(""). If a schedule does not exist for the specified Job Name, the edit command will fail. + +g) snap_scheduler.py delete "Job Name" + +This command deletes an existing snapshot schedule. It takes the job name of the schedule as argument. The argument must be provided within double-quotes(""). If a schedule does not exist for the specified Job Name, the delete command will fail. 
+ +h) snap_scheduler.py list + +This command lists the existing snapshot schedules in the following manner: Pseudocode: + +# snap_scheduler.py list +JOB_NAME SCHEDULE OPERATION VOLUME NAME +-------------------------------------------------------------------- +Job0 * * * * * Snapshot Create test_vol + +THE AGENT +--------- + +The snapshots scheduled with the help of the helper script, are read by crond which then invokes the agent(gcron.py) at the scheduled intervals to perform the snapshot operations on the specified volumes. It then performs the scheduled snapshots using the following algorithm to coordinate. + +start_time = get current time +lock_file = job_name passed as an argument +vol_name = volume name passed as an argument +try POSIX locking the $lock_file + if lock is obtained, then + mod_time = Get modification time of $entry + if $mod_time < $start_time, then + Take snapshot of $entry.name (Volume name) + if snapshot failed, then + log the failure + Update modification time of $entry to current time + unlock the $entry + +The coordination with other scripts running on other nodes, is handled by the use of POSIX locks. All the instances of the script will attempt to lock the lock_file which is essentially an empty file with the job name, and the one which gets the lock will take the snapshot. + +To prevent redoing a done task, the script will make use of the mtime attribute of the entry. At the beginning of execution, the script would have saved its start time. Once the script obtains the lock on the lock_file, before taking the snapshot, it compares the mtime of the entry with the start time. The snapshot will only be taken if the mtime is smaller than start time. Once the snapshot command completes, the script will update the mtime of the lock_file to the current time before unlocking. + +If a snapshot command fails, the script will log the failure (in syslog) and continue with its operation. 
It will not attempt to retry the failed snapshot in the current schedule, but will attempt it again in the next schedules. It is left to the administrator to monitor the logs and decide what to do after a failure. + +ASSUMPTIONS AND LIMITATIONS +--------------------------- + +It is assumed that all nodes in the cluster have their times synced using NTP or any other mechanism. This is a hard requirement for this feature to work. + +The administrator needs to have python2.7 or higher installed, as well as the argparse module installed, to be able to use the helper script(snap_scheduler.py). + +There is a latency of one minute, between providing a command by the helper script and that command taking effect. Hence, currently we do not support snapshot schedules with per minute granularity. + +The administrator can however leverage the scheduler to schedule snapshots with granularity of half-hourly/hourly/daily/weekly/monthly/yearly periodic intervals. They can also schedule snapshots, which are customised mentioning which minute of the hour, which day of the week, which week of the month, and which month of the year, they want to schedule the snapshot operation. diff --git a/extras/snap_scheduler/conf.py.in b/extras/snap_scheduler/conf.py.in new file mode 100644 index 00000000000..6dcca0534a7 --- /dev/null +++ b/extras/snap_scheduler/conf.py.in @@ -0,0 +1,11 @@ +# +# Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> +# This file is part of GlusterFS. + +# This file is licensed to you under your choice of the GNU Lesser +# General Public License, version 3 or any later version (LGPLv3 or +# later), or the GNU General Public License, version 2 (GPLv2), in all +# cases as published by the Free Software Foundation. 
# --- patch context preserved from the surrounding diff (tail of conf.py.in) ---
# +#
# +
# +GLUSTERFS_LIBEXECDIR = '@GLUSTERFS_LIBEXECDIR@'
# diff --git a/extras/snap_scheduler/gcron.py b/extras/snap_scheduler/gcron.py new file mode 100755 index 00000000000..0e4df77d481 --- /dev/null +++ b/extras/snap_scheduler/gcron.py @@ -0,0 +1,190 @@
# --- gcron.py -----------------------------------------------------------------
#!/usr/bin/python3
#
# Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
# This file is part of GlusterFS.
#
# This file is licensed to you under your choice of the GNU Lesser
# General Public License, version 3 or any later version (LGPLv3 or
# later), or the GNU General Public License, version 2 (GPLv2), in all
# cases as published by the Free Software Foundation.

from __future__ import print_function
import errno
import subprocess
import os
import os.path
import sys
import time
import logging
import logging.handlers
import fcntl


GCRON_TASKS = "/run/gluster/shared_storage/snaps/glusterfs_snap_cron_tasks"
GCRON_CROND_TASK = "/etc/cron.d/glusterfs_snap_cron_tasks"
GCRON_RELOAD_FLAG = "/var/run/gluster/crond_task_reload_flag"
LOCK_FILE_DIR = "/run/gluster/shared_storage/snaps/lock_files/"
log = logging.getLogger("gcron-logger")
# Set by main() just before a job runs; doJob() compares lock-file mtimes
# against it to decide whether the job was already processed.
start_time = 0.0


def initLogger(script_name):
    """Attach a syslog handler and, when possible, a file handler to `log`.

    The log directory is obtained from ``gluster --print-logdir``; the file
    name is `script_name` without its last three characters plus ``.log``.
    If the gluster command cannot be run or fails, only the syslog handler
    is installed instead of crashing on an undefined log file path.
    """
    log.setLevel(logging.DEBUG)
    logFormat = "[%(asctime)s %(filename)s:%(lineno)s %(funcName)s] "\
        "%(levelname)s %(message)s"
    formatter = logging.Formatter(logFormat)

    sh = logging.handlers.SysLogHandler()
    sh.setLevel(logging.ERROR)
    sh.setFormatter(formatter)
    log.addHandler(sh)

    try:
        process = subprocess.Popen(["gluster", "--print-logdir"],
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        out, err = process.communicate()
    except OSError as e:
        log.error("Failed to run 'gluster --print-logdir'. Error: %s", e)
        return

    if process.returncode != 0:
        log.error("'gluster --print-logdir' returned %d; file logging "
                  "is disabled", process.returncode)
        return

    logfile = os.path.join(out.strip(), script_name[:-3]+".log")

    fh = logging.FileHandler(logfile)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    log.addHandler(fh)


def takeSnap(volname="", snapname=""):
    """Run ``gluster snapshot create <snapname> <volname>``.

    Returns True when the command exits with status 0, False when either
    name is empty or the command fails (stderr is logged on failure).
    """
    success = True
    if volname == "":
        log.debug("No volname given")
        return False
    if snapname == "":
        log.debug("No snapname given")
        return False

    cli = ["gluster",
           "snapshot",
           "create",
           snapname,
           volname]
    log.debug("Running command '%s'", " ".join(cli))

    # universal_newlines=True so that stderr is logged as text, not bytes.
    p = subprocess.Popen(cli, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    out, err = p.communicate()
    rv = p.returncode

    log.debug("Command '%s' returned '%d'", " ".join(cli), rv)

    if rv:
        log.error("Snapshot of %s failed", volname)
        log.error("Command output:")
        log.error(err)
        success = False
    else:
        log.info("Snapshot of %s successful", volname)

    return success


def doJob(name, lockFile, jobFunc, volname):
    """Run `jobFunc(volname, name)` once, guarded by a lock on `lockFile`.

    The job runs only if a non-blocking exclusive flock succeeds and the
    lock file's mtime is older than the module-level `start_time`; after
    the job the mtime is refreshed. Returns False when the job function
    reports failure or the lock file cannot be opened, True otherwise
    (including "already processed" and "locked by another agent").
    """
    success = True
    try:
        f = os.open(lockFile, os.O_CREAT | os.O_RDWR | os.O_NONBLOCK)
        try:
            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
            mtime = os.path.getmtime(lockFile)
            global start_time
            log.debug("%s last modified at %s", lockFile, time.ctime(mtime))
            if mtime < start_time:
                log.debug("Processing job %s", name)
                if jobFunc(volname, name):
                    log.info("Job %s succeeded", name)
                else:
                    log.error("Job %s failed", name)
                    success = False
                os.utime(lockFile, None)
            else:
                log.info("Job %s has been processed already", name)
            fcntl.flock(f, fcntl.LOCK_UN)
        except (OSError, IOError):
            log.info("Job %s is being processed by another agent", name)
        os.close(f)
    except (OSError, IOError) as e:
        log.debug("Failed to open lock file %s : %s", lockFile, e)
        log.error("Failed to process job %s", name)
        success = False

    return success


def _touch_crond_task():
    """Touch GCRON_CROND_TASK (without following symlinks) and log failures."""
    process = subprocess.Popen(["touch", "-h", GCRON_CROND_TASK],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)
    out, err = process.communicate()
    if process.returncode != 0:
        log.error("Failed to touch %s. Error: %s.",
                  GCRON_CROND_TASK, err)


def main():
    """Entry point: ``gcron.py --update`` or ``gcron.py <volname> <jobname>``.

    ``--update`` touches GCRON_CROND_TASK when the shared task file is newer
    (or a reload flag is pending); otherwise a snapshot job is run through
    doJob().
    """
    script_name = os.path.basename(__file__)
    initLogger(script_name)
    global start_time

    if len(sys.argv) < 2:
        log.error("Usage: %s --update | <volname> <jobname>", script_name)
        return

    if sys.argv[1] == "--update":
        if not os.path.exists(GCRON_TASKS):
            # Create a flag in /var/run/gluster which indicates that this
            # node doesn't have access to GCRON_TASKS right now, so that
            # when the mount is available and GCRON_TASKS is available
            # the flag will tell this routine to reload GCRON_CROND_TASK
            try:
                f = os.open(GCRON_RELOAD_FLAG,
                            os.O_CREAT | os.O_NONBLOCK, 0o644)
                os.close(f)
            except OSError as e:
                # The original compared undefined names (errno/EEXIST) and
                # called an undefined output(); use the exception's errno
                # and print instead.
                if e.errno != errno.EEXIST:
                    log.error("Failed to create %s : %s",
                              GCRON_RELOAD_FLAG, e)
                    print("Failed to create %s. Error: %s"
                          % (GCRON_RELOAD_FLAG, e))
            return

        if not os.path.exists(GCRON_CROND_TASK):
            return

        # As GCRON_TASKS exists now, we should check if GCRON_RELOAD_FLAG
        # also exists. If so we should touch GCRON_CROND_TASK and remove
        # the GCRON_RELOAD_FLAG
        if os.path.exists(GCRON_RELOAD_FLAG):
            try:
                os.remove(GCRON_RELOAD_FLAG)
                _touch_crond_task()
            except (IOError, OSError) as e:
                log.error("Failed to touch %s. Error: %s.",
                          GCRON_CROND_TASK, e)
            return
        if os.lstat(GCRON_TASKS).st_mtime > \
           os.lstat(GCRON_CROND_TASK).st_mtime:
            try:
                _touch_crond_task()
            except IOError as e:
                log.error("Failed to touch %s. Error: %s.",
                          GCRON_CROND_TASK, e)
        return

    if len(sys.argv) < 3:
        log.error("Usage: %s --update | <volname> <jobname>", script_name)
        return

    volname = sys.argv[1]
    jobname = sys.argv[2]
    locking_file = os.path.join(LOCK_FILE_DIR, jobname)
    log.debug("locking_file = %s", locking_file)
    log.debug("volname = %s", volname)
    log.debug("jobname = %s", jobname)

    start_time = int(time.time())

    doJob("Scheduled-" + jobname + "-" + volname, locking_file, takeSnap, volname)


if __name__ == "__main__":
    main()

# --- patch context preserved from the surrounding diff (start of snap_scheduler.py) ---
# diff --git a/extras/snap_scheduler/snap_scheduler.py b/extras/snap_scheduler/snap_scheduler.py new file mode 100755 index 00000000000..e8fcc449a9b --- /dev/null +++ b/extras/snap_scheduler/snap_scheduler.py @@ -0,0 +1,941 @@
#!/usr/bin/python3
#
# Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
# This file is part of GlusterFS.
#
# This file is licensed to you under your choice of the GNU Lesser
# General Public License, version 3 or any later version (LGPLv3 or
# later), or the GNU General Public License, version 2 (GPLv2), in all
# cases as published by the Free Software Foundation.

from __future__ import print_function
import subprocess
import os
import os.path
import logging
import argparse
import fcntl
import logging.handlers
import sys
import shutil
from errno import EEXIST
from conf import GLUSTERFS_LIBEXECDIR
sys.path.insert(1, GLUSTERFS_LIBEXECDIR)

EVENTS_ENABLED = True
try:
    from events.eventtypes import SNAPSHOT_SCHEDULER_INITIALISED \
        as EVENT_SNAPSHOT_SCHEDULER_INITIALISED
    from events.eventtypes import SNAPSHOT_SCHEDULER_INIT_FAILED \
        as EVENT_SNAPSHOT_SCHEDULER_INIT_FAILED
    from events.eventtypes import SNAPSHOT_SCHEDULER_DISABLED \
        as EVENT_SNAPSHOT_SCHEDULER_DISABLED
    from events.eventtypes import SNAPSHOT_SCHEDULER_DISABLE_FAILED \
        as EVENT_SNAPSHOT_SCHEDULER_DISABLE_FAILED
    from events.eventtypes import SNAPSHOT_SCHEDULER_ENABLED \
        as EVENT_SNAPSHOT_SCHEDULER_ENABLED
    from events.eventtypes import SNAPSHOT_SCHEDULER_ENABLE_FAILED \
        as EVENT_SNAPSHOT_SCHEDULER_ENABLE_FAILED
    from events.eventtypes import SNAPSHOT_SCHEDULER_SCHEDULE_ADDED \
        as EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_ADDED
    from events.eventtypes import SNAPSHOT_SCHEDULER_SCHEDULE_ADD_FAILED \
        as EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_ADD_FAILED
    from events.eventtypes import SNAPSHOT_SCHEDULER_SCHEDULE_DELETED \
        as EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_DELETED
    from events.eventtypes import SNAPSHOT_SCHEDULER_SCHEDULE_DELETE_FAILED \
        as EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_DELETE_FAILED
    from events.eventtypes import SNAPSHOT_SCHEDULER_SCHEDULE_EDITED \
        as EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_EDITED
    from events.eventtypes import SNAPSHOT_SCHEDULER_SCHEDULE_EDIT_FAILED \
        as EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_EDIT_FAILED
except ImportError:
    # Events APIs not installed, dummy eventtypes with None
    EVENTS_ENABLED = False
    EVENT_SNAPSHOT_SCHEDULER_INITIALISED = None
    EVENT_SNAPSHOT_SCHEDULER_INIT_FAILED = None
    EVENT_SNAPSHOT_SCHEDULER_DISABLED = None
    EVENT_SNAPSHOT_SCHEDULER_DISABLE_FAILED = None
    EVENT_SNAPSHOT_SCHEDULER_ENABLED = None
    EVENT_SNAPSHOT_SCHEDULER_ENABLE_FAILED = None
    EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_ADDED = None
    EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_ADD_FAILED = None
    EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_DELETED = None
    EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_DELETE_FAILED = None
    EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_EDITED = None
    EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_EDIT_FAILED = None

SCRIPT_NAME = "snap_scheduler"
scheduler_enabled = False
log = logging.getLogger(SCRIPT_NAME)
SHARED_STORAGE_DIR="/run/gluster/shared_storage"
GCRON_DISABLED = SHARED_STORAGE_DIR+"/snaps/gcron_disabled"
GCRON_ENABLED = SHARED_STORAGE_DIR+"/snaps/gcron_enabled"
GCRON_TASKS = SHARED_STORAGE_DIR+"/snaps/glusterfs_snap_cron_tasks"
GCRON_CROND_TASK = "/etc/cron.d/glusterfs_snap_cron_tasks"
LOCK_FILE_DIR = SHARED_STORAGE_DIR+"/snaps/lock_files/"
LOCK_FILE = LOCK_FILE_DIR+"lock_file"
TMP_FILE = SHARED_STORAGE_DIR+"/snaps/tmp_file"
GCRON_UPDATE_TASK = "/etc/cron.d/gcron_update_task"
CURRENT_SCHEDULER = SHARED_STORAGE_DIR+"/snaps/current_scheduler"
tasks = {}
longest_field = 12
current_scheduler = ""

# Return/exit codes shared by the helper functions below.
INTERNAL_ERROR = 2
SHARED_STORAGE_DIR_DOESNT_EXIST = 3
SHARED_STORAGE_NOT_MOUNTED = 4
ANOTHER_TRANSACTION_IN_PROGRESS = 5
INIT_FAILED = 6
SCHEDULING_ALREADY_DISABLED = 7
SCHEDULING_ALREADY_ENABLED = 8
NODE_NOT_INITIALISED = 9
ANOTHER_SCHEDULER_ACTIVE = 10
JOB_ALREADY_EXISTS = 11
JOB_NOT_FOUND = 12
INVALID_JOBNAME = 13
INVALID_VOLNAME = 14
INVALID_SCHEDULE = 15
INVALID_ARG = 16
VOLUME_DOES_NOT_EXIST = 17


def print_error (error_num):
    """Return the human-readable message for `error_num`.

    Despite the name, nothing is printed. Codes without a message yield
    None.
    """
    shared_dir_prefix = "The shared storage directory ("+SHARED_STORAGE_DIR+")"
    messages = {
        INTERNAL_ERROR: "Internal Error",
        SHARED_STORAGE_DIR_DOESNT_EXIST: shared_dir_prefix + " does not exist.",
        SHARED_STORAGE_NOT_MOUNTED: shared_dir_prefix + " is not mounted.",
        ANOTHER_TRANSACTION_IN_PROGRESS: "Another transaction is in progress.",
        INIT_FAILED: "Initialisation failed.",
        SCHEDULING_ALREADY_DISABLED: "Snapshot scheduler is already disabled.",
        SCHEDULING_ALREADY_ENABLED: "Snapshot scheduler is already enabled.",
        NODE_NOT_INITIALISED: "The node is not initialised.",
        ANOTHER_SCHEDULER_ACTIVE: "Another scheduler is active.",
        JOB_ALREADY_EXISTS: "The job already exists.",
        JOB_NOT_FOUND: "The job cannot be found.",
        INVALID_JOBNAME: "The job name is invalid.",
        INVALID_VOLNAME: "The volume name is invalid.",
        INVALID_SCHEDULE: "The schedule is invalid.",
        INVALID_ARG: "The argument is invalid.",
        VOLUME_DOES_NOT_EXIST: "The volume does not exist.",
    }
    return messages.get(error_num)
def output(msg):
    """Print `msg` to stdout prefixed with the script name."""
    print("%s: %s" % (SCRIPT_NAME, msg))


def initLogger():
    """Attach syslog (ERROR+) and file (DEBUG+) handlers to `log`.

    The log file lives in the directory printed by
    ``gluster --print-logdir`` and is named ``<SCRIPT_NAME>.log``.
    """
    log.setLevel(logging.DEBUG)
    logFormat = "[%(asctime)s %(filename)s:%(lineno)s %(funcName)s] "\
        "%(levelname)s %(message)s"
    formatter = logging.Formatter(logFormat)

    sh = logging.handlers.SysLogHandler()
    sh.setLevel(logging.ERROR)
    sh.setFormatter(formatter)

    process = subprocess.Popen(["gluster", "--print-logdir"],
                               stdout=subprocess.PIPE, universal_newlines=True)
    # communicate() reads the output and reaps the child process.
    out, _ = process.communicate()
    logfile = os.path.join(out.strip(), SCRIPT_NAME + ".log")

    fh = logging.FileHandler(logfile)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)

    log.addHandler(sh)
    log.addHandler(fh)


def scheduler_status():
    """Set the global `scheduler_enabled` flag; return 0 or INTERNAL_ERROR.

    The scheduler counts as enabled when GCRON_TASKS resolves to
    GCRON_ENABLED and GCRON_ENABLED exists.
    """
    ret = INTERNAL_ERROR
    global scheduler_enabled
    try:
        f = os.path.realpath(GCRON_TASKS)
        if f != os.path.realpath(GCRON_ENABLED) or not os.path.exists(GCRON_ENABLED):
            log.info("Snapshot scheduler is currently disabled.")
            scheduler_enabled = False
        else:
            log.info("Snapshot scheduler is currently enabled.")
            scheduler_enabled = True
        ret = 0
    except Exception:
        # Narrowed from a bare "except:" so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        log.error("Failed to enable snapshot scheduling. Error: "
                  "Failed to check the status of %s.", GCRON_DISABLED)

    return ret


def enable_scheduler():
    """Enable snapshot scheduling by pointing GCRON_TASKS at GCRON_ENABLED.

    Returns 0 on success, otherwise one of the module's error codes.
    """
    ret = scheduler_status()
    if ret == 0:
        if not scheduler_enabled:

            # Check if another scheduler is active.
            ret = get_current_scheduler()
            if ret == 0:
                if (current_scheduler != "none"):
                    print_str = "Failed to enable snapshot scheduling. " \
                                "Error: Another scheduler is active."
                    log.error(print_str)
                    output(print_str)
                    ret = ANOTHER_SCHEDULER_ACTIVE
                    return ret
            else:
                print_str = "Failed to get current scheduler info."
                log.error(print_str)
                output(print_str)
                return ret

            log.info("Enabling snapshot scheduler.")
            try:
                if os.path.exists(GCRON_DISABLED):
                    os.remove(GCRON_DISABLED)
                if os.path.lexists(GCRON_TASKS):
                    os.remove(GCRON_TASKS)
                try:
                    f = os.open(GCRON_ENABLED, os.O_CREAT | os.O_NONBLOCK,
                                0o644)
                    os.close(f)
                except OSError as e:
                    log.error("Failed to open %s. Error: %s.",
                              GCRON_ENABLED, e)
                    ret = INTERNAL_ERROR
                    return ret
                os.symlink(GCRON_ENABLED, GCRON_TASKS)
                update_current_scheduler("cli")
                log.info("Snapshot scheduling is enabled")
                output("Snapshot scheduling is enabled")
                ret = 0
            except OSError as e:
                # The original concatenated a str with the exception object,
                # which raises TypeError; format the exception instead.
                print_str = ("Failed to enable snapshot scheduling. "
                             "Error: %s" % e)
                log.error(print_str)
                output(print_str)
                ret = INTERNAL_ERROR
        else:
            print_str = "Failed to enable snapshot scheduling. " \
                        "Error: Snapshot scheduling is already enabled."
            log.error(print_str)
            output(print_str)
            ret = SCHEDULING_ALREADY_ENABLED
    else:
        print_str = "Failed to enable snapshot scheduling. " \
                    "Error: Failed to check scheduler status."
        log.error(print_str)
        output(print_str)

    return ret


def disable_scheduler():
    """Disable snapshot scheduling by pointing GCRON_TASKS at GCRON_DISABLED.

    Returns 0 on success, otherwise one of the module's error codes.
    """
    ret = scheduler_status()
    if ret == 0:
        if scheduler_enabled:
            log.info("Disabling snapshot scheduler.")
            try:
                # Check if another scheduler is active. If not, then
                # update current scheduler to "none". Else do nothing.
                ret = get_current_scheduler()
                if ret == 0:
                    if (current_scheduler == "cli"):
                        update_current_scheduler("none")
                else:
                    print_str = "Failed to disable snapshot scheduling. " \
                                "Error: Failed to get current scheduler info."
                    log.error(print_str)
                    output(print_str)
                    return ret

                if os.path.exists(GCRON_DISABLED):
                    os.remove(GCRON_DISABLED)
                if os.path.lexists(GCRON_TASKS):
                    os.remove(GCRON_TASKS)
                f = os.open(GCRON_DISABLED, os.O_CREAT, 0o644)
                os.close(f)
                os.symlink(GCRON_DISABLED, GCRON_TASKS)
                log.info("Snapshot scheduling is disabled")
                output("Snapshot scheduling is disabled")
                ret = 0
            except OSError as e:
                # str + exception raised TypeError in the original.
                print_str = ("Failed to disable snapshot scheduling. Error: "
                             "%s" % e)
                log.error(print_str)
                output(print_str)
                ret = INTERNAL_ERROR
        else:
            print_str = "Failed to disable scheduling. " \
                        "Error: Snapshot scheduling is already disabled."
            log.error(print_str)
            output(print_str)
            ret = SCHEDULING_ALREADY_DISABLED
    else:
        print_str = "Failed to disable snapshot scheduling. " \
                    "Error: Failed to check scheduler status."
        log.error(print_str)
        output(print_str)
        ret = INTERNAL_ERROR

    return ret


def load_tasks_from_file():
    """Parse GCRON_ENABLED into the global `tasks` dict.

    Each entry maps jobname -> "schedule:volname". `longest_field` is
    widened to fit the longest field seen. Parsing stops at the first
    empty line. Returns 0, or INTERNAL_ERROR if the file cannot be read.
    """
    global tasks
    global longest_field
    try:
        with open(GCRON_ENABLED, 'r') as f:
            for raw in f:
                entry = raw.rstrip('\n')
                if not entry:
                    break
                # Lines look like: "<schedule> root PATH=... gcron.py <vol> <job>"
                before, after = entry.split("gcron.py")[:2]
                schedule = before.split("root")[0].rstrip(' ')
                fields = after.split(" ")
                volname = fields[1]
                jobname = fields[2]
                longest_field = max(longest_field, len(jobname), len(volname),
                                    len(schedule))
                tasks[jobname] = schedule+":"+volname
        ret = 0
    except IOError as e:
        log.error("Failed to open %s. Error: %s.", GCRON_ENABLED, e)
        ret = INTERNAL_ERROR

    return ret


def get_current_scheduler():
    """Read the first line of CURRENT_SCHEDULER into `current_scheduler`.

    Returns 0, or INTERNAL_ERROR if the file cannot be read.
    """
    global current_scheduler
    try:
        with open(CURRENT_SCHEDULER, 'r') as f:
            current_scheduler = f.readline().rstrip('\n')
        ret = 0
    except IOError as e:
        log.error("Failed to open %s. Error: %s.", CURRENT_SCHEDULER, e)
        ret = INTERNAL_ERROR

    return ret


def list_schedules():
    """Print a table of the scheduled jobs; return 0 or an error code."""
    log.info("Listing snapshot schedules.")
    ret = load_tasks_from_file()
    if ret == 0:
        if len(tasks) == 0:
            output("No snapshots scheduled")
        else:
            width = longest_field + 5
            jobname = "JOB_NAME".ljust(width)
            schedule = "SCHEDULE".ljust(width)
            operation = "OPERATION".ljust(width)
            volname = "VOLUME NAME".ljust(width)
            hyphens = "".ljust(width * 4, '-')
            print(jobname+schedule+operation+volname)
            print(hyphens)
            for key in sorted(tasks):
                jobname = key.ljust(width)
                schedule = tasks[key].split(":")[0].ljust(width)
                volname = tasks[key].split(":")[1].ljust(width)
                operation = "Snapshot Create".ljust(width)
                print(jobname+schedule+operation+volname)
            ret = 0
    else:
        print_str = "Failed to list snapshot schedules. " \
                    "Error: Failed to load tasks from "+GCRON_ENABLED
        log.error(print_str)
        output(print_str)

    return ret


def write_tasks_to_file():
    """Write `tasks` as cron entries to TMP_FILE, then move it to GCRON_ENABLED.

    Returns 0, or INTERNAL_ERROR if the temporary file cannot be written.
    """
    try:
        # The original passed 0o644 as open()'s third positional argument,
        # which is the buffer size, not a permission mode; it is dropped.
        with open(TMP_FILE, "w") as f:
            # If tasks is empty, just create an empty tmp file
            if len(tasks) != 0:
                for key in sorted(tasks):
                    jobname = key
                    schedule = tasks[key].split(":")[0]
                    volname = tasks[key].split(":")[1]
                    f.write("%s root PATH=$PATH:/usr/local/sbin:/usr/sbin "
                            "gcron.py %s %s\n" % (schedule, volname, jobname))
                f.write("\n")
                f.flush()
                os.fsync(f.fileno())
    except IOError as e:
        log.error("Failed to open %s. Error: %s.", TMP_FILE, e)
        ret = INTERNAL_ERROR
        return ret

    shutil.move(TMP_FILE, GCRON_ENABLED)
    ret = 0

    return ret


def update_current_scheduler(data):
    """Write `data` to TMP_FILE and move it over CURRENT_SCHEDULER.

    Returns 0, or INTERNAL_ERROR if the temporary file cannot be written.
    """
    try:
        # See write_tasks_to_file(): 0o644 was being used as a buffer size.
        with open(TMP_FILE, "w") as f:
            f.write("%s" % data)
            f.flush()
            os.fsync(f.fileno())
    except IOError as e:
        log.error("Failed to open %s. Error: %s.", TMP_FILE, e)
        ret = INTERNAL_ERROR
        return ret

    shutil.move(TMP_FILE, CURRENT_SCHEDULER)
    ret = 0

    return ret


def isVolumePresent(volname):
    """Return True when ``gluster volume info <volname>`` exits with 0."""
    success = False
    if volname == "":
        log.debug("No volname given")
        return success

    cli = ["gluster",
           "volume",
           "info",
           volname]
    log.debug("Running command '%s'", " ".join(cli))

    p = subprocess.Popen(cli, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    out, err = p.communicate()
    rv = p.returncode

    log.debug("Command '%s' returned '%d'", " ".join(cli), rv)

    if rv:
        log.error("Command output:")
        log.error(err)
    else:
        success = True

    return success


def add_schedules(jobname, schedule, volname):
    """Add a new job and create its lock file; return 0 or an error code."""
    log.info("Adding snapshot schedules.")
    ret = load_tasks_from_file()
    if ret == 0:
        if jobname in tasks:
            print_str = ("%s already exists in schedule. Use "
                         "'edit' to modify %s" % (jobname, jobname))
            log.error(print_str)
            output(print_str)
            ret = JOB_ALREADY_EXISTS
        else:
            if not isVolumePresent(volname):
                print_str = ("Volume %s does not exist. Create %s and retry." %
                             (volname, volname))
                log.error(print_str)
                output(print_str)
                ret = VOLUME_DOES_NOT_EXIST
            else:
                tasks[jobname] = schedule + ":" + volname
                ret = write_tasks_to_file()
                if ret == 0:
                    # Create a LOCK_FILE for the job
                    job_lockfile = LOCK_FILE_DIR + jobname
                    try:
                        f = os.open(job_lockfile, os.O_CREAT | os.O_NONBLOCK,
                                    0o644)
                        os.close(f)
                    except OSError as e:
                        log.error("Failed to open %s. Error: %s.",
                                  job_lockfile, e)
                        ret = INTERNAL_ERROR
                        return ret
                    log.info("Successfully added snapshot schedule %s" %
                             jobname)
                    output("Successfully added snapshot schedule")
                    ret = 0
    else:
        print_str = "Failed to add snapshot schedule. " \
                    "Error: Failed to load tasks from "+GCRON_ENABLED
        log.error(print_str)
        output(print_str)

    return ret


def delete_schedules(jobname):
    """Remove a job and its lock file; return 0 or an error code."""
    log.info("Delete snapshot schedules.")
    ret = load_tasks_from_file()
    if ret == 0:
        if jobname in tasks:
            del tasks[jobname]
            ret = write_tasks_to_file()
            if ret == 0:
                # Delete the LOCK_FILE for the job
                job_lockfile = LOCK_FILE_DIR+jobname
                try:
                    os.remove(job_lockfile)
                except OSError as e:
                    log.error("Failed to open %s. Error: %s.",
                              job_lockfile, e)
                    ret = INTERNAL_ERROR
                    return ret
                log.info("Successfully deleted snapshot schedule %s"
                         % jobname)
                output("Successfully deleted snapshot schedule")
                ret = 0
        else:
            print_str = ("Failed to delete %s. Error: No such "
                         "job scheduled" % jobname)
            log.error(print_str)
            output(print_str)
            ret = JOB_NOT_FOUND
    else:
        print_str = "Failed to delete snapshot schedule. " \
                    "Error: Failed to load tasks from "+GCRON_ENABLED
        log.error(print_str)
        output(print_str)

    return ret


def edit_schedules(jobname, schedule, volname):
    """Replace the schedule/volume of an existing job; return 0 or an error code."""
    log.info("Editing snapshot schedules.")
    ret = load_tasks_from_file()
    if ret == 0:
        if jobname in tasks:
            if not isVolumePresent(volname):
                print_str = ("Volume %s does not exist. Create %s and retry." %
                             (volname, volname))
                log.error(print_str)
                output(print_str)
                ret = VOLUME_DOES_NOT_EXIST
            else:
                tasks[jobname] = schedule+":"+volname
                ret = write_tasks_to_file()
                if ret == 0:
                    log.info("Successfully edited snapshot schedule %s" %
                             jobname)
                    output("Successfully edited snapshot schedule")
        else:
            print_str = ("Failed to edit %s. Error: No such "
                         "job scheduled" % jobname)
            log.error(print_str)
            output(print_str)
            ret = JOB_NOT_FOUND
    else:
        print_str = "Failed to edit snapshot schedule. " \
                    "Error: Failed to load tasks from "+GCRON_ENABLED
        log.error(print_str)
        output(print_str)

    return ret


def get_bool_val():
    """Return the value of the SELinux boolean cron_system_cronjob_use_shares.

    Runs ``getsebool -a | grep cron_system_cronjob_use_shares`` and returns
    the third whitespace-separated token of the output as text, or -1 when
    the commands cannot be run or grep finds nothing.
    """
    getsebool_cli = ["getsebool",
                     "-a"]
    grep_cmd = ["grep",
                "cron_system_cronjob_use_shares"]
    try:
        p1 = subprocess.Popen(getsebool_cli, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        p2 = subprocess.Popen(grep_cmd, stdin=p1.stdout,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True)
    except OSError as oserr:
        log.error("Failed to query selinux booleans. Error: %s", oserr)
        return -1

    p1.stdout.close()
    # Named "out" so the module-level output() helper is not shadowed; text
    # mode makes the result comparable with "on"/"off" under Python 3.
    out, err = p2.communicate()
    rv = p2.returncode

    if rv:
        log.error("Command output:")
        log.error(err)
        return -1

    bool_val = out.split()[2]
    log.debug("Bool value = '%s'", bool_val)

    return bool_val


def get_selinux_status():
    """Return the output of ``getenforce`` as text, or -1 on failure."""
    getenforce_cli = ["getenforce"]
    log.debug("Running command '%s'", " ".join(getenforce_cli))

    try:
        # Text mode so callers can compare the result with "Disabled".
        p1 = subprocess.Popen(getenforce_cli, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True)
    except OSError as oserr:
        log.error("Failed to run the command \"getenforce\". Error: %s" %
                  oserr)
        return -1

    out, err = p1.communicate()
    rv = p1.returncode

    if rv:
        log.error("Command output:")
        log.error(err)
        return -1
    else:
        selinux_status = out.rstrip()
        log.debug("selinux status: %s", selinux_status)

    return selinux_status


def set_cronjob_user_share():
    """Turn the SELinux boolean cron_system_cronjob_use_shares on if it is off.

    Returns 0 when nothing needs doing or the boolean ends up "on", and a
    non-zero value otherwise.
    """
    selinux_status = get_selinux_status()
    if (selinux_status == -1):
        log.error("Failed to get selinux status")
        return -1
    elif (selinux_status == "Disabled"):
        return 0

    bool_val = get_bool_val()
    # In case of a failure (where the boolean value is not)
    # present in the system, we should not proceed further
    # We should only proceed when the value is "off"
    if (bool_val == -1 or bool_val != "off"):
        return 0

    setsebool_cli = ["setsebool", "-P",
                     "cron_system_cronjob_use_shares",
                     "on"]
    log.debug("Running command '%s'", " ".join(setsebool_cli))

    p1 = subprocess.Popen(setsebool_cli, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          universal_newlines=True)

    out, err = p1.communicate()
    rv = p1.returncode

    if rv:
        log.error("Command output:")
        log.error(err)
        return rv

    bool_val = get_bool_val()
    if (bool_val == "on"):
        return 0
    else:
        # In case of an error or if boolean is not on
        # we return a failure here
        return -1


def initialise_scheduler():
    """Install the cron update task and the GCRON_CROND_TASK symlink.

    Returns 0 on success, INIT_FAILED when a file cannot be written, or the
    non-zero result of set_cronjob_user_share().
    """
    ret = set_cronjob_user_share()
    if ret:
        log.error("Failed to set selinux boolean "
                  "cron_system_cronjob_use_shares to 'on'")
        return ret

    try:
        # 0o644 was previously passed as open()'s buffering argument.
        with open(TMP_FILE, "w+") as f:
            updater = ("* * * * * root PATH=$PATH:/usr/local/sbin:"
                       "/usr/sbin gcron.py --update\n")
            f.write("%s\n" % updater)
            f.flush()
            os.fsync(f.fileno())
    except IOError as e:
        log.error("Failed to open %s. Error: %s.", TMP_FILE, e)
        ret = INIT_FAILED
        return ret

    shutil.move(TMP_FILE, GCRON_UPDATE_TASK)

    if not os.path.lexists(GCRON_TASKS):
        try:
            f = open(GCRON_TASKS, "w")
            f.close()
        except IOError as e:
            log.error("Failed to open %s. Error: %s.", GCRON_TASKS, e)
            ret = INIT_FAILED
            return ret

    if os.path.lexists(GCRON_CROND_TASK):
        os.remove(GCRON_CROND_TASK)

    os.symlink(GCRON_TASKS, GCRON_CROND_TASK)

    log.info("Successfully initialised snapshot scheduler for this node")
    output("Successfully initialised snapshot scheduler for this node")
    gf_event (EVENT_SNAPSHOT_SCHEDULER_INITIALISED, status="Success")

    ret = 0
    return ret


def syntax_checker(args):
    """Validate (and strip) jobname/volname/schedule on `args` when present.

    Returns 0 when every present attribute is well-formed, otherwise
    INVALID_JOBNAME, INVALID_VOLNAME or INVALID_SCHEDULE.
    """
    if hasattr(args, 'jobname'):
        if (len(args.jobname.split()) != 1):
            output("Invalid Jobname. Jobname should not be empty and should not contain \" \" character.")
            ret = INVALID_JOBNAME
            return ret
        args.jobname=args.jobname.strip()

    if hasattr(args, 'volname'):
        if (len(args.volname.split()) != 1):
            output("Invalid Volname. Volname should not be empty and should not contain \" \" character.")
            ret = INVALID_VOLNAME
            return ret
        args.volname=args.volname.strip()

    if hasattr(args, 'schedule'):
        if (len(args.schedule.split()) != 5):
            output("Invalid Schedule. Please refer to the following for adding a valid cron schedule")
            print ("* * * * *")
            print ("| | | | |")
            print ("| | | | +---- Day of the Week (range: 1-7, 1 standing for Monday)")
            print ("| | | +------ Month of the Year (range: 1-12)")
            print ("| | +-------- Day of the Month (range: 1-31)")
            print ("| +---------- Hour (range: 0-23)")
            print ("+------------ Minute (range: 0-59)")
            ret = INVALID_SCHEDULE
            return ret

    ret = 0
    return ret


# NOTE(review): perform_operation continues beyond this chunk; the visible
# head is reproduced unchanged (re-indented only).
def perform_operation(args):
    if not os.path.exists(CURRENT_SCHEDULER):
        update_current_scheduler("none")

    # Initialise snapshot scheduler on local node
    if args.action == "init":
        ret = initialise_scheduler()
        if ret != 0:
            output("Failed to initialise snapshot scheduling")
            gf_event (EVENT_SNAPSHOT_SCHEDULER_INIT_FAILED,
                      error=print_error(ret))
        return ret

    # Disable snapshot scheduler
    if args.action == "disable_force":
        ret = disable_scheduler()
        if ret == 0:
            subprocess.Popen(["touch", "-h", GCRON_TASKS])
            gf_event (EVENT_SNAPSHOT_SCHEDULER_DISABLED,
                      status="Successfully Disabled")
        else:
            gf_event (EVENT_SNAPSHOT_SCHEDULER_DISABLE_FAILED,
                      error=print_error(ret))
        return ret

    # Check if the symlink to GCRON_TASKS is properly set in the shared storage
    if (not os.path.lexists(GCRON_UPDATE_TASK) or
            not os.path.lexists(GCRON_CROND_TASK) or
            os.readlink(GCRON_CROND_TASK) != GCRON_TASKS):
        print_str = ("Please run 'snap_scheduler.py' init to initialise "
                     "the snap scheduler for the local node.")
        log.error(print_str)
        output(print_str)
        ret = NODE_NOT_INITIALISED
        return ret

    # Check status of snapshot scheduler.
+ if args.action == "status": + ret = scheduler_status() + if ret == 0: + if scheduler_enabled: + output("Snapshot scheduling status: Enabled") + else: + output("Snapshot scheduling status: Disabled") + else: + output("Failed to check status of snapshot scheduler") + return ret + + # Enable snapshot scheduler + if args.action == "enable": + ret = enable_scheduler() + if ret == 0: + subprocess.Popen(["touch", "-h", GCRON_TASKS]) + gf_event (EVENT_SNAPSHOT_SCHEDULER_ENABLED, + status="Successfully Enabled") + else: + gf_event (EVENT_SNAPSHOT_SCHEDULER_ENABLE_FAILED, + error=print_error(ret)) + return ret + + # Disable snapshot scheduler + if args.action == "disable": + ret = disable_scheduler() + if ret == 0: + subprocess.Popen(["touch", "-h", GCRON_TASKS]) + gf_event (EVENT_SNAPSHOT_SCHEDULER_DISABLED, + status="Successfully Disabled") + else: + gf_event (EVENT_SNAPSHOT_SCHEDULER_DISABLE_FAILED, + error=print_error(ret)) + return ret + + # List snapshot schedules + if args.action == "list": + ret = list_schedules() + return ret + + # Add snapshot schedules + if args.action == "add": + ret = syntax_checker(args) + if ret != 0: + return ret + ret = add_schedules(args.jobname, args.schedule, args.volname) + if ret == 0: + subprocess.Popen(["touch", "-h", GCRON_TASKS]) + gf_event (EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_ADDED, + status="Successfully added job "+args.jobname) + else: + gf_event (EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_ADD_FAILED, + status="Failed to add job "+args.jobname, + error=print_error(ret)) + return ret + + # Delete snapshot schedules + if args.action == "delete": + ret = syntax_checker(args) + if ret != 0: + return ret + ret = delete_schedules(args.jobname) + if ret == 0: + subprocess.Popen(["touch", "-h", GCRON_TASKS]) + gf_event (EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_DELETED, + status="Successfully deleted job "+args.jobname) + else: + gf_event (EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_DELETE_FAILED, + status="Failed to delete job "+args.jobname, + 
error=print_error(ret)) + return ret + + # Edit snapshot schedules + if args.action == "edit": + ret = syntax_checker(args) + if ret != 0: + return ret + ret = edit_schedules(args.jobname, args.schedule, args.volname) + if ret == 0: + subprocess.Popen(["touch", "-h", GCRON_TASKS]) + gf_event (EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_EDITED, + status="Successfully edited job "+args.jobname) + else: + gf_event (EVENT_SNAPSHOT_SCHEDULER_SCHEDULE_EDIT_FAILED, + status="Failed to edit job "+args.jobname, + error=print_error(ret)) + return ret + + ret = INVALID_ARG + return ret + +def gf_event(event_type, **kwargs): + if EVENTS_ENABLED: + from events.gf_event import gf_event as gfevent + gfevent(event_type, **kwargs) + + +def main(argv): + initLogger() + ret = -1 + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="action", + metavar=('{init, status, enable,' + ' disable, list, add,' + ' delete, edit}')) + subparsers.add_parser('init', + help="Initialise the node for snapshot scheduling") + + subparsers.add_parser("status", + help="Check if snapshot scheduling is " + "enabled or disabled") + subparsers.add_parser("enable", + help="Enable snapshot scheduling") + subparsers.add_parser("disable", + help="Disable snapshot scheduling") + subparsers.add_parser("disable_force") + subparsers.add_parser("list", + help="List snapshot schedules") + parser_add = subparsers.add_parser("add", + help="Add snapshot schedules") + parser_add.add_argument("jobname", help="Job Name") + parser_add.add_argument("schedule", help="Schedule") + parser_add.add_argument("volname", help="Volume Name") + + parser_delete = subparsers.add_parser("delete", + help="Delete snapshot schedules") + parser_delete.add_argument("jobname", help="Job Name") + parser_edit = subparsers.add_parser("edit", + help="Edit snapshot schedules") + parser_edit.add_argument("jobname", help="Job Name") + parser_edit.add_argument("schedule", help="Schedule") + parser_edit.add_argument("volname", 
help="Volume Name") + + args = parser.parse_args(argv) + + if not os.path.exists(SHARED_STORAGE_DIR): + output("Failed: "+SHARED_STORAGE_DIR+" does not exist.") + return SHARED_STORAGE_DIR_DOESNT_EXIST + + if not os.path.ismount(SHARED_STORAGE_DIR): + output("Failed: Shared storage is not mounted at "+SHARED_STORAGE_DIR) + return SHARED_STORAGE_NOT_MOUNTED + + if not os.path.exists(SHARED_STORAGE_DIR+"/snaps/"): + try: + os.makedirs(SHARED_STORAGE_DIR+"/snaps/") + except OSError as e: + if errno != EEXIST: + log.error("Failed to create %s : %s", SHARED_STORAGE_DIR+"/snaps/", e) + output("Failed to create %s. Error: %s" + % (SHARED_STORAGE_DIR+"/snaps/", e)) + return INTERNAL_ERROR + + if not os.path.exists(GCRON_ENABLED): + f = os.open(GCRON_ENABLED, os.O_CREAT | os.O_NONBLOCK, 0o644) + os.close(f) + + if not os.path.exists(LOCK_FILE_DIR): + try: + os.makedirs(LOCK_FILE_DIR) + except OSError as e: + if errno != EEXIST: + log.error("Failed to create %s : %s", LOCK_FILE_DIR, e) + output("Failed to create %s. Error: %s" + % (LOCK_FILE_DIR, e)) + return INTERNAL_ERROR + + try: + f = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR | os.O_NONBLOCK, 0o644) + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + ret = perform_operation(args) + fcntl.flock(f, fcntl.LOCK_UN) + except IOError: + log.info("%s is being processed by another agent.", LOCK_FILE) + output("Another snap_scheduler command is running. " + "Please try again after some time.") + return ANOTHER_TRANSACTION_IN_PROGRESS + os.close(f) + except OSError as e: + log.error("Failed to open %s : %s", LOCK_FILE, e) + output("Failed to open %s. Error: %s" % (LOCK_FILE, e)) + return INTERNAL_ERROR + + return ret + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) |
