monit-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

pid and ppid change monitoring patch


From: Martin Pala
Subject: pid and ppid change monitoring patch
Date: Thu, 30 Dec 2004 16:42:15 +0100
User-agent: Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.3) Gecko/20040910

Hi,

Here is a patch which implements monitoring of process PID and PPID changes. Currently monit doesn't recognize that a process was restarted if the restart was faster than monit's sleep time between testing cycles.

This patch makes the pid and ppid testing implicit (no rule needs to be added to the control file). The default action is 'alert'. The user may override the alert action using:

  if changed pid then <action>

or respectively by:

  if changed ppid then <action>


Example situations where it may be useful:

1.) For example, the sshd daemon can restart very quickly, so if someone
changes its configuration and restarts sshd outside of monit
control, you will be notified that the process was replaced by
a new instance (or you can optionally take some other action, such as
preventively stopping sshd) => it may serve as a security check.

2.) Another example is MySQL Cluster, which has its own watchdog with
process restart ability. You can use monit for redundant monitoring.
Monit will just send an alert in the case that MySQL Cluster
restarted the node quickly.


If you agree with the patch, I can add it to CVS or make some changes if needed ;)


Martin
diff -Naur monit-mp-base/CHANGES.txt monit-mp/CHANGES.txt
--- monit-mp-base/CHANGES.txt   2004-12-29 13:33:29.452592000 +0000
+++ monit-mp/CHANGES.txt        2004-12-30 15:19:06.379238000 +0000
@@ -34,6 +34,16 @@
        allow 192.168.1.0/255.255.255.0
        allow 10.0.0.0/8
 
+*  Added process PID change test. This test is implicit and monit will
+   send alert by default. It is also possible to override the alert
+   action by explicit definition:
+       if changed pid then <action>
+
+*  Added process PPID (parent pid) change test. This test is implicit
+   and monit will send alert by default. It is also possible to override
+   the alert action by explicit definition:
+       if changed ppid then <action>
+
 *  Internal control file language changes. The char '=' is promoted to
    a keyword and not longer ignored by monit. This change should be
    backward compatible and shouldn't affect existing control files.
diff -Naur monit-mp-base/control.c monit-mp/control.c
--- monit-mp-base/control.c     2004-12-20 07:39:46.464861000 +0000
+++ monit-mp/control.c  2004-12-30 14:14:43.231530000 +0000
@@ -295,8 +295,8 @@
   monitor_unset(s);
 
   if(s->type==TYPE_PROCESS) {
-    /* Reset the proc info object in case of a later restart */
-    Util_resetProcInfo(s);
+    /* Reset the info object in case of a later restart */
+    Util_resetInfo(s);
   }
 
   if(s->stop && (s->type!=TYPE_PROCESS || Util_isProcessRunning(s))) {
@@ -363,8 +363,8 @@
   monitor_unset(s);
    
   if(s->type==TYPE_PROCESS) {
-    /* Reset the proc info object in case of a later restart */
-    Util_resetProcInfo(s);
+    /* Reset the info object in case of a later restart */
+    Util_resetInfo(s);
   }
   
 }
diff -Naur monit-mp-base/gc.c monit-mp/gc.c
--- monit-mp-base/gc.c  2004-12-20 07:39:48.057729000 +0000
+++ monit-mp/gc.c       2004-12-30 14:07:57.572431000 +0000
@@ -209,6 +209,12 @@
   if((*s)->action_TIMEOUT)
     _gc_eventaction(&(*s)->action_TIMEOUT);
   
+  if((*s)->action_PID)
+    _gc_eventaction(&(*s)->action_PID);
+  
+  if((*s)->action_PPID)
+    _gc_eventaction(&(*s)->action_PPID);
+  
   if((*s)->eventlist)
     _gc_event(&(*s)->eventlist);
   
diff -Naur monit-mp-base/http/cervlet.c monit-mp/http/cervlet.c
--- monit-mp-base/http/cervlet.c        2004-12-21 13:27:42.009647000 +0000
+++ monit-mp/http/cervlet.c     2004-12-30 15:26:28.113669000 +0000
@@ -1560,6 +1560,13 @@
 
 static void print_service_rules_process(HttpResponse res, Service_T s) {
 
+  if(s->type == TYPE_PROCESS) {
+    out_print(res, "<tr><td>Pid</td><td>If changed then %s</td></tr>\n",
+      actionnames[s->action_PID->failed->id]);
+    out_print(res, "<tr><td>Ppid</td><td>If changed then %s</td></tr>\n",
+      actionnames[s->action_PPID->failed->id]);
+  }
+
   if(s->resourcelist) {
 
     Resource_T    q;
diff -Naur monit-mp-base/l.l monit-mp/l.l
--- monit-mp-base/l.l   2004-12-20 07:39:48.102032000 +0000
+++ monit-mp/l.l        2004-12-29 16:16:35.917237000 +0000
@@ -247,6 +247,8 @@
 collector         { return COLLECTOR; }
 url               { return URL; }
 content           { return CONTENT; }
+pid               { return PID; }
+ppid              { return PPID; }
 {byte}            { return BYTE; }
 {kilobyte}        { return KILOBYTE; }
 {megabyte}        { return MEGABYTE; }
diff -Naur monit-mp-base/monit.pod monit-mp/monit.pod
--- monit-mp-base/monit.pod     2004-12-28 13:49:24.799510000 +0000
+++ monit-mp/monit.pod  2004-12-30 15:15:36.396658000 +0000
@@ -867,9 +867,9 @@
 watches whether the value will change again. You can use it just
 for alert or to involve some automatic action, as for example to
 reload monitored process after its configuration file was changed.
-Variable tests are supported for 'checksum', 'size' and 'timestamp'
-tests only, if you consider that other tests can be useful in
-variable form too, please let us know.
+Variable tests are supported for 'checksum', 'size', 'pid', 'ppid'
+and 'timestamp' tests only, if you consider that other tests can
+be useful in variable form too, please let us know.
 
 =over 4
 
@@ -1481,6 +1481,83 @@
 
 
 
+=head2 PID TESTING
+
+monit tests the process id (pid) of processes for change. This test
+is implicit and monit will send alert in the case of failure by
+default.
+
+You may override the default action using below rule (it may only
+be used within a process service entry in the monit control file).
+
+The syntax for the pid statement is:
+
+=over 4
+
+=item IF CHANGED PID THEN action
+
+=back
+
+I<action> is a choice of "ALERT", "RESTART", "START", "STOP",
+"EXEC" or "UNMONITOR".
+
+This test is useful to detect possible process restarts which
+have occurred in the timeframe between two monit testing cycles.
+In the case that the restart was fast and the process provides
+expected service (i.e. all tests passed) you will be notified
+that the process was replaced.
+
+For example sshd daemon can restart very quickly, thus if someone
+changes its configuration and does an sshd restart outside of monit
+control, you will be notified that the process was replaced by
+a new instance (or you can optionally do some other action such as
+preventively stop sshd).
+
+Another example is MySQL Cluster which has its own watchdog with
+process restart ability. You can use monit for redundant monitoring.
+Monit will just send alert in the case that the MySQL cluster
+restarted the node quickly.
+
+Example:
+
+ check process sshd with pidfile /var/run/sshd.pid
+       if changed pid then exec "/my/script"
+       alert address@hidden
+
+
+
+=head2 PPID TESTING
+
+monit tests the process parent id (ppid) of processes for change.
+This test is implicit and monit will send alert in the case of
+failure by default.
+
+You may override the default action using below rule (it may only
+be used within a process service entry in the monit control file).
+
+The syntax for the ppid statement is:
+
+=over 4
+
+=item IF CHANGED PPID THEN action
+
+=back
+
+I<action> is a choice of "ALERT", "RESTART", "START", "STOP",
+"EXEC" or "UNMONITOR".
+
+This test is useful to detect possible process parent change.
+This may normally happen only in the case that the process parent
+exited.
+
+Example:
+
+ check process myproc with pidfile /var/run/myproc.pid
+       if changed ppid then exec "/my/script"
+       alert address@hidden
+
+
+
 =head2 CONNECTION TESTING
 
 Monit is able to perform connection testing via networked ports
@@ -2442,6 +2519,10 @@
  stop            The program used to stop the specified
                  service. Full path is required. This 
                  statement is optional, but recommended.
+ pid and ppid    These keywords may be used as standalone
+                 statements in a process service entry to
+                 override the alert action for change of
+                 process pid and ppid.
  uid and gid     These keywords are either 1) an optional part of
                  a start, stop or exec statement. They may be
                  used to specify a user id and a group id the
@@ -2597,10 +2678,10 @@
 I<ftp>, I<smtp>, I<pop>, I<nntp>, I<imap>, I<ssh>, I<dwp>,
 I<ldap2>, I<ldap3>, I<request>, I<cpu>, I<mem>, I<totalmem>,
 I<children>, I<loadavg>, I<timestamp>, I<changed>, I<second(s)>,
-I<minute(s)>, I<hour(s)>, I<day(s)>, I<space>, I<inode>,
-I<perm(ission)>, I<process>, I<file>, I<directory>, I<device>,
-I<size>, I<unmonitor>, I<rdate>, I<rsync>, I<data>, I<invalid>,
-I<exec>, I<nonexist> and I<failed>
+I<minute(s)>, I<hour(s)>, I<day(s)>, I<space>, I<inode>, I<pid>,
+I<ppid>, I<perm(ission)>, I<process>, I<file>, I<directory>,
+I<device>, I<size>, I<unmonitor>, I<rdate>, I<rsync>, I<data>,
+I<invalid>, I<exec>, I<nonexist> and I<failed>
 
 And here is a complete list of B<noise keywords> ignored by
 monit:
diff -Naur monit-mp-base/monitor.h monit-mp/monitor.h
--- monit-mp-base/monitor.h     2004-12-20 07:39:48.270167000 +0000
+++ monit-mp/monitor.h  2004-12-30 14:08:42.783756000 +0000
@@ -596,12 +596,14 @@
   long   space_total;                           /**< Used space total blocks */
 
   /* FIle specific */
-  size_t st_size;                                                 /**< Size */
-  char  *cs_sum;                                              /**< Checksum */
+  size_t st_size;                                                  /**< Size */
+  char  *cs_sum;                                               /**< Checksum */
 
   /* Process specific */
-  int    pid;
-  int    ppid;
+  int    _pid;                              /**< Process PID from last cycle */
+  int    _ppid;                      /**< Process parent PID from last cycle */
+  int    pid;                             /**< Process PID from actual cycle */
+  int    ppid;                     /**< Process parent PID from actual cycle */
   int    status_flag;
   int    children;
   long   mem_kbyte;    
@@ -654,6 +656,9 @@
   Uid_T       uid;                                            /**< Uid check */
   URL_T       urllist;                    /**< URLs to check for the service */
   
+  EventAction_T action_PID;   /**< Description of the action upon pid change */
+  EventAction_T action_PPID; /**< Description of the action upon ppid change */
+
   /** General event handlers */
   EventAction_T action_DATA;       /**< Description of the action upon event */
   EventAction_T action_EXEC;       /**< Description of the action upon event */
diff -Naur monit-mp-base/monitrc monit-mp/monitrc
--- monit-mp-base/monitrc       2004-12-28 13:49:24.919893000 +0000
+++ monit-mp/monitrc    2004-12-30 14:43:56.683855000 +0000
@@ -46,6 +46,14 @@
 #                     be  used as a standalone statement in a file service 
 #                     check entry to check for changes in gid.
 #
+#   pid            -- This statement is an optional part of process service
+#                     check entry to override default alert action in the case
+#                     of pid change.
+#
+#   ppid           -- This statement is an optional part of process service
+#                     check entry to override default alert action in the case
+#                     of parent pid change.
+#
 #   host           -- Specify a hostname or an ip-address to test port 
 #                     connection at. This statement must be followed by a
 #                     port statement.
diff -Naur monit-mp-base/p.y monit-mp/p.y
--- monit-mp-base/p.y   2004-12-28 13:49:25.096891000 +0000
+++ monit-mp/p.y        2004-12-30 14:21:06.942154000 +0000
@@ -196,6 +196,7 @@
   static void  addeuid(uid_t);
   static void  addegid(gid_t);
   static void  addeventaction(EventAction_T *, int, int);
+  static void  seteventaction(EventAction_T *, int, int);
   static void  prepare_urlrequest(URL_T U);
   static void  seturlrequest(int, char *);
   static void  setlogfile(char *);
@@ -261,7 +262,7 @@
 %token BYTE KILOBYTE MEGABYTE GIGABYTE
 %token INODE SPACE PERMISSION SIZE
 %token EXEC UNMONITOR ICMP ICMPECHO NONEXIST INVALID DATA RECOVERED
-%token URL CONTENT
+%token URL CONTENT PID PPID
 %token <url> URLOBJECT
 
 %left GREATER LESS EQUAL NOTEQUAL
@@ -299,6 +300,8 @@
 
 optproc         : start
                 | stop
+                | pid
+                | ppid
                 | connection
                 | connectionunix
                 | timeout
@@ -829,6 +832,18 @@
                  }
                 ;
 
+pid             : IF CHANGED PID THEN action1 {
+                    seteventaction(&(current)->action_PID, $<number>5,
+                      ACTION_IGNORE);
+                  }
+                ;
+
+ppid            : IF CHANGED PPID THEN action1 {
+                    seteventaction(&(current)->action_PPID, $<number>5,
+                      ACTION_IGNORE);
+                  }
+                ;
+
 nettimeout      : /* EMPTY */ {
                    $<number>$= NET_TIMEOUT;
                   }
@@ -1478,6 +1493,7 @@
   current->check= check;
 
   createinfo();
+  Util_resetInfo(current);
 
   /* Initialize general event handlers */
   addeventaction(&(current)->action_DATA,     ACTION_ALERT,     ACTION_ALERT);
@@ -1485,6 +1501,8 @@
   addeventaction(&(current)->action_INVALID,  ACTION_RESTART,   ACTION_ALERT);
   addeventaction(&(current)->action_NONEXIST, ACTION_RESTART,   ACTION_ALERT);
   addeventaction(&(current)->action_TIMEOUT,  ACTION_UNMONITOR, ACTION_ALERT);
+  addeventaction(&(current)->action_PID,      ACTION_ALERT,     ACTION_IGNORE);
+  addeventaction(&(current)->action_PPID,     ACTION_ALERT,     ACTION_IGNORE);
   
   pthread_mutex_init(&current->mutex, NULL);
 
@@ -1940,6 +1958,33 @@
 
 
 /*
+ * Redefine EventAction object (used for default action overloading)
+ */
+static void seteventaction(EventAction_T *_ea, int failed, int passed) {
+
+  EventAction_T ea = *_ea;
+
+  ASSERT(ea);
+  ASSERT(ea->failed);
+  ASSERT(ea->passed);
+
+  ea->failed->id= failed;
+  ea->passed->id= passed;
+
+  if(failed == ACTION_EXEC) {
+    ASSERT(command1);
+    ea->failed->exec = command1;
+    command1 = NULL;
+  }
+  if(passed == ACTION_EXEC) {
+    ASSERT(command2);
+    ea->passed->exec = command2;
+    command2 = NULL;
+  }
+}
+
+
+/*
  * Adds runtime info to current service
  */
 static void createinfo() {
@@ -1947,6 +1992,7 @@
   Info_T inf;
 
   NEW(inf);
+  
   current->inf= inf;
   
 }
diff -Naur monit-mp-base/process.c monit-mp/process.c
--- monit-mp-base/process.c     2004-10-18 08:06:33.228789000 +0000
+++ monit-mp/process.c  2004-12-30 15:00:15.321853000 +0000
@@ -124,11 +124,15 @@
   ASSERT(s);
   ASSERT(systeminfo.mem_kbyte_max > 0);
 
-  s->inf->pid=pid;
+  /* save the previous pid and set actual one */
+  s->inf->_pid= s->inf->pid;
+  s->inf->pid = pid;
 
   if ((leaf = findprocess(pid, pt, treesize)) != NULL ) {
  
-    s->inf->ppid=leaf->ppid;
+    /* save the previous ppid and set actual one */
+    s->inf->_ppid= s->inf->ppid;
+    s->inf->ppid= leaf->ppid;
     s->inf->children=leaf->children_sum;
     s->inf->mem_kbyte=leaf->mem_kbyte;
     s->inf->status_flag=leaf->status_flag;
diff -Naur monit-mp-base/util.c monit-mp/util.c
--- monit-mp-base/util.c        2004-12-20 07:39:48.676209000 +0000
+++ monit-mp/util.c     2004-12-30 14:58:30.957380000 +0000
@@ -646,6 +646,13 @@
     if(d->dependant != NULL)
       printf(" %-20s = %s\n", "Depends on Service", d->dependant);
 
+  if(s->type == TYPE_PROCESS) {
+    printf(" %-20s = if changed then %s\n",
+      "Pid", actionnames[s->action_PID->failed->id]);
+    printf(" %-20s = if changed then %s\n",
+      "Ppid", actionnames[s->action_PPID->failed->id]);
+  }
+
   if(s->checksum && s->checksum->action) {
     Checksum_T cs= s->checksum;
     EventAction_T a= cs->action;
@@ -1031,7 +1038,7 @@
   if((pid= Util_getPid(s->path)))
     if( (getpgid(pid) > -1) || (errno == EPERM) )
       return pid;
-  Util_resetProcInfo(s);
+  Util_resetInfo(s);
   
   return FALSE;
   
@@ -1523,10 +1530,14 @@
 
 
 /**
- * Reset the process information structure
+ * Reset the service information structure
  */
-void Util_resetProcInfo(Service_T s) {
+void Util_resetInfo(Service_T s) {
   memset(s->inf, 0, sizeof *(s->inf));
+  s->inf->_pid=  -1;
+  s->inf->_ppid= -1;
+  s->inf->pid=   -1;
+  s->inf->ppid=  -1;
 }
 
 
diff -Naur monit-mp-base/util.h monit-mp/util.h
--- monit-mp-base/util.h        2004-12-20 05:39:34.000000000 +0000
+++ monit-mp/util.h     2004-12-30 14:16:00.005220000 +0000
@@ -330,9 +330,9 @@
 
 
 /**
- * Reset the process information structure
+ * Reset the service information structure
  */
-void Util_resetProcInfo(Service_T s);
+void Util_resetInfo(Service_T s);
 
 
 /**
diff -Naur monit-mp-base/validate.c monit-mp/validate.c
--- monit-mp-base/validate.c    2004-12-20 07:39:48.749276000 +0000
+++ monit-mp/validate.c 2004-12-30 14:51:38.620346000 +0000
@@ -107,6 +107,8 @@
 static void check_checksum(Service_T);
 static void check_timestamp(Service_T);
 static void check_process_state(Service_T);
+static void check_process_pid(Service_T);
+static void check_process_ppid(Service_T);
 static void check_connection(Service_T, Port_T);
 static void check_device_resources(Service_T, Device_T);
 static void check_process_resources(Service_T, Resource_T);
@@ -176,8 +178,8 @@
 
   /* Test for running process */
   if(!(pid= Util_isProcessRunning(s))) {
-    /* Reset the proc info object to prevent false data in the first run */
-    Util_resetProcInfo(s);
+    /* Reset the service info object to prevent false data in the first run */
+    Util_resetInfo(s);
     Event_post(s, EVENT_NONEXIST, TRUE, s->action_NONEXIST,
       "'%s' process is not running", s->name);
     return FALSE;
@@ -191,6 +193,8 @@
   if(Run.doprocess) {
     if(update_process_data(s, ptree, ptreesize, pid)) {
       check_process_state(s);
+      check_process_pid(s);
+      check_process_ppid(s);
       for(pr= s->resourcelist; pr; pr= pr->next) {
         check_process_resources(s, pr);
       }
@@ -529,6 +533,48 @@
 
 
 /**
+ * Test process pid for possible change since last cycle
+ */
+static void check_process_pid(Service_T s) {
+
+  ASSERT(s && s->inf);
+
+  /* process pid was not initialized yet */
+  if(s->inf->_pid == -1)
+    return;
+
+  if(s->inf->_pid != s->inf->pid) {
+    Event_post(s, EVENT_CHANGED, TRUE, s->action_PID,
+      "'%s' process PID changed to %d", s->name, s->inf->pid);
+  } else {
+    Event_post(s, EVENT_CHANGED, FALSE, s->action_PID,
+      "'%s' PID has not changed", s->name);
+  }
+}
+
+
+/**
+ * Test process ppid for possible change since last cycle
+ */
+static void check_process_ppid(Service_T s) {
+
+  ASSERT(s && s->inf);
+
+  /* process ppid was not initialized yet */
+  if(s->inf->_ppid == -1)
+    return;
+
+  if(s->inf->_ppid != s->inf->ppid) {
+    Event_post(s, EVENT_CHANGED, TRUE, s->action_PPID,
+      "'%s' process PPID changed to %d", s->name, s->inf->ppid);
+  } else {
+    Event_post(s, EVENT_CHANGED, FALSE, s->action_PPID,
+      "'%s' PPID has not changed", s->name);
+  }
+}
+
+
+/**
  * Check process resources
  */
 static void check_process_resources(Service_T s, Resource_T pr) {

reply via email to

[Prev in Thread] Current Thread [Next in Thread]