summaryrefslogtreecommitdiffstats
path: root/xlators/features/marker/utils/syncdaemon/monitor.py
blob: a86acdc75666098b15327501c0ca307db4043be7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import sys
import time
import logging
import select
from signal import SIGKILL
from gconf import gconf
from syncdutils import update_file

class Monitor(object):
    """Supervise a gsyncd worker process, restarting it when it fails.

    The most recently reported state is kept in ``self.state`` and, when
    ``gconf.state_file`` is set, also written to that file.
    """

    def __init__(self):
        # No state has been reported yet.
        self.state = None

    def set_state(self, state):
        """Record a state transition.

        Does nothing when `state` equals the current state.  Otherwise the
        new state is stored, logged and -- if ``gconf.state_file`` holds a
        true value -- written (followed by a newline) via ``update_file``.
        """
        if state == self.state:
            return
        self.state = state
        logging.info('new state: %s' % state)
        if getattr(gconf, 'state_file', None):
            update_file(gconf.state_file, lambda f: f.write(state + '\n'))

    def monitor(self):
        """Run the worker in a loop until it exits with a status other
        than 0 or 1, and return that status.

        Each round forks a child which re-executes the current command
        line (minus the daemonizing/monitor options) with an extra
        ``--feedback-fd`` argument naming the write end of a pipe.  The
        parent waits up to `conn_timeout` seconds for activity on the read
        end of that pipe:

        * activity in time: wait out the remainder of the timeout, then
          poll the child; if it is still running the state becomes 'OK'
          and the parent blocks until the worker terminates;
        * no activity: the worker is killed with SIGKILL.

        A worker that ended with status 0 or 1 before reaching 'OK' marks
        the state 'faulty'.  Termination other than a normal exit is
        reported as status 1.  The state is set to 'inconsistent' right
        before returning.
        """
        # Build the worker command line from our own, dropping the
        # options that must not be passed on and forcing '-N -p ""'.
        argv = sys.argv[:]
        for o in ('-N', '--no-daemon', '--monitor'):
            while o in argv:
                argv.remove(o)
        argv.extend(('-N', '-p', ''))
        argv.insert(0, os.path.basename(sys.executable))

        self.set_state('starting...')
        ret = 0

        def nwait(p, o=0):
            """Wait for child `p` with waitpid options `o`.

            Returns None when waitpid reports no status change (pid 0,
            as with WNOHANG on a still-running child), the exit status on
            normal exit, and 1 for any other kind of termination.
            """
            p2, r = os.waitpid(p, o)
            if not p2:
                return
            if os.WIFEXITED(r):
                return os.WEXITSTATUS(r)
            return 1

        conn_timeout = 60
        while ret in (0, 1):
            logging.info('-' * conn_timeout)
            logging.info('starting gsyncd worker')
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                # Child: on success execv never returns.  If it fails we
                # must not unwind through the supervisor's call stack in a
                # forked copy of the process, so exit right here.
                try:
                    os.close(pr)
                    os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
                except Exception:
                    logging.exception('failed to exec gsyncd worker')
                finally:
                    os._exit(1)
            os.close(pw)
            t0 = time.time()
            try:
                # Returns on activity (or EOF) on the feedback pipe, or
                # after conn_timeout seconds.
                select.select((pr,), (), (), conn_timeout)
            finally:
                # Do not leak the read end even if select() raises.
                os.close(pr)
            et = time.time() - t0
            if et < conn_timeout:
                et2 = conn_timeout - et
                logging.debug("worker got connected in %d sec, "
                              "waiting %d more to make sure it's fine" % (et, et2))
                time.sleep(et2)
                # Non-blocking poll: None means the worker is still alive.
                ret = nwait(cpid, os.WNOHANG)
            else:
                logging.debug("worker not confirmed in %d sec, aborting it" % et)
                os.kill(cpid, SIGKILL)
                ret = nwait(cpid)
            if ret is None:
                self.set_state('OK')
                # Block until the healthy worker eventually terminates.
                ret = nwait(cpid)
            elif ret in (0, 1):
                self.set_state('faulty')
            time.sleep(10)
        self.set_state('inconsistent')
        return ret

def monitor():
    """Create a fresh Monitor, run its monitoring loop and return the
    loop's result."""
    supervisor = Monitor()
    return supervisor.monitor()