bug-make
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: race condition with reap_children when $(shell) used


From: Grant Taylor
Subject: Re: race condition with reap_children when $(shell) used
Date: Fri, 09 May 2003 18:56:27 -0400

>>>>> Grant Taylor <address@hidden> writes:

> If you use the -l loadavg job waiting feature, then when the load was
> or is too high, make may or may not obtain a token for jobs it puts in
> the waiting_jobs list.  This is because it obtains a token only if
> there are live children, and it is possible for there to be no live
> children yet many waiting children.  I believe it should always obtain
> (and yield) tokens if there are *either* waiting or live children,
> however doing this alone does not solve the problem.

OK, I've finally gotten this all fixed and working correctly.  Below
is my slightly messy patch for the following parallel/jobserver
issues:

 - Token leak from children of $(shell)

   Simple race against the children list; corrected here by directly
   calling waitpid for the shell incantation instead of reap_children.

   I've added a "reap_lock" variable which the reap_children() code
   checks, it unceremoniously dumps core if there is such a race.

 - Token leak when load too high condition toggles

   An issue with the jobserver read loop; the "free" token can only be
   used (via the break out of the read loop) when there are *neither*
   children nor waiting jobs.  This is because waiting jobs count as
   jobs too.  Not only must the break(s) be adjusted to check both,
   the loop must both reap children and move jobs from waiting to live
   before ever breaking; otherwise new jobs will sometimes be added to
   the live or waiting active set without reading a token.

   I've added a series of assertions, not least of which is "never
   sleep on read if there are no live children".

 - Small optimization to check for reduced load average during very
   long-running jobs, not just when a child ends.  

   This fixes a problem related to savannah bug 105.  If make starts 8
   jobs under -l4, the load will spike to 8ish.  If there are 7 small
   jobs that finish quickly and 1 huge job that runs for a while, then
   soon only 1 job will be left running.  The load will quickly drop
   back to 1, but make won't notice until the huge child finishes.

   I've added an extra way for the token read call to exit - a simple
   alarm(1) call just before the read (with a suitable signal handler,
   etc).  This way, when this situation happens, make will wake
   periodically and attempt to move jobs from waiting to live.  As the
   load drops, jobs will be added, and things run smoother.


diff -ur make-3.80/function.c make-3.80-gst/function.c
--- make-3.80/function.c        Fri Apr 25 10:12:51 2003
+++ make-3.80-gst/function.c    Fri Apr 11 19:05:21 2003
@@ -1622,8 +1632,32 @@
 
       /* Loop until child_handler sets shell_function_completed
         to the status of our child shell.  */
+#if 0
+      /* Badness: this can be called from between start_command and
+         the child list insertion, in which case we leak job tokens! */
       while (shell_function_completed == 0)
        reap_children (1, 0);
+#else
+      {
+       int wrval;
+       int exit_sig;
+       int exit_code;
+       int status;
+       do {
+         wrval = waitpid(pid, &status, 0);
+       } while(wrval == 0 || (wrval == -1 && errno == EINTR));
+       
+       exit_code = WEXITSTATUS (status);
+       if (exit_code == 0xff)
+         exit_code = -1;
+       exit_sig = WIFSIGNALED (status) ? WTERMSIG (status) : 0;
+       if (exit_sig==0 && exit_code == 127) {
+         shell_function_completed == -1;
+       } else {
+         shell_function_completed = 1;
+       }
+      }
+#endif
 
       if (batch_filename) {
        DB (DB_VERBOSE, (_("Cleaning up temporary batch file %s\n"),
diff -ur make-3.80/job.c make-3.80-gst/job.c
--- make-3.80/job.c     Fri Apr 25 10:12:51 2003
+++ make-3.80-gst/job.c Thu May  8 18:09:14 2003
@@ -211,6 +211,8 @@
 
 unsigned int job_slots_used = 0;
 
+unsigned int jobserver_tokens = 0;
+
 /* Nonzero if the `good' standard input is in use.  */
 
 static int good_stdin_used = 0;
@@ -406,7 +408,7 @@
 
 
 extern int shell_function_pid, shell_function_completed;
-
+static int reap_lock=0;
 /* Reap all dead children, storing the returned status and the new command
    state (`cs_finished') in the `file' member of the `struct child' for the
    dead child, and removing the child from the chain.  In addition, if BLOCK
@@ -428,6 +430,10 @@
 # define REAP_MORE dead_children
 #endif
 
+  if (reap_lock) {
+    abort();
+  }
+
   /* As long as:
 
        We have at least one child outstanding OR a shell function in progress,
@@ -475,17 +481,37 @@
 
       any_remote = 0;
       any_local = shell_function_pid != 0;
-      for (c = children; c != 0; c = c->next)
-       {
-         any_remote |= c->remote;
-         any_local |= ! c->remote;
-         DB (DB_JOBS, (_("Live child 0x%08lx (%s) PID %ld %s\n"),
-                        (unsigned long int) c, c->file->name,
-                        (long) c->pid, c->remote ? _(" (remote)") : ""));
+      {
+       int jobct=0;
+
+         for (c = children; c != 0; c = c->next)
+           {
+             jobct++;
+             any_remote |= c->remote;
+             any_local |= ! c->remote;
+             DB (DB_JOBS, (_("Live child 0x%08lx (%s) PID %ld %s make pid 
%d\n"),
+                           (unsigned long int) c, c->file->name,
+                           (long) c->pid, c->remote ? _(" (remote)") : "", 
getpid()));
 #ifdef VMS
-         break;
+             break;
 #endif
+           }
+
+       for (c = waiting_jobs; c != 0; c = c->next)
+         {
+           jobct++;
+           DB (DB_JOBS, (_("Waiting child 0x%08lx (%s) make pid %d\n"),
+                         (unsigned long int) c, c->file->name, getpid()));
+         }
+       DB (DB_JOBS, (_("jobserver_tokens=%d job_slots=%d job_slots_used=%d 
make pid %d\n"), 
+                     jobserver_tokens, job_slots, job_slots_used, getpid()));
+       if (jobct != jobserver_tokens+1) {
+         fprintf(stderr, "job/token mismatch %d != %d + 1\n",
+                 jobct, jobserver_tokens);
+         abort();
        }
+      }
+
 
       /* First, check for remote children.  */
       if (any_remote)
@@ -778,17 +804,33 @@
      token back for it.  This child has already been removed from the list,
      so if there any left this wasn't the last one.  */
 
-  if (job_fds[1] >= 0 && children)
+  DB (DB_JOBS, (_("free_child() jobserver_tokens=%d children=%p 
waiting_jobs=%p\n"), 
+               jobserver_tokens, children, waiting_jobs));
+
+  if (job_fds[1] >= 0 && (children || waiting_jobs))
     {
       char token = '+';
+      
 
       /* Write a job token back to the pipe.  */
+      if (!jobserver_tokens) {
+       fprintf(stderr, 
+               "make: error, no jobserver tokens in this proc, but writing one 
back!!!\n");
+       exit(1);
+      }
+
 
       if (write (job_fds[1], &token, 1) != 1)
        pfatal_with_name (_("write jobserver"));
 
-      DB (DB_JOBS, (_("Released token for child 0x%08lx (%s).\n"),
-                    (unsigned long int) child, child->file->name));
+      jobserver_tokens--;
+
+      DB (DB_JOBS, (_("Released token for child 0x%08lx (%s) now have %d in 
pid %d.\n"),
+                    (unsigned long int) child, child->file->name, 
jobserver_tokens, getpid()));
+    } else {
+      DB (DB_JOBS, (_("No token for child 0x%08lx (%s) children=%p, 
job_fds[0]=%d job_fds[1]=%d.\n"),
+                    (unsigned long int) child, child->file->name,
+                   children, job_fds[0], job_fds[1]));
     }
 
   if (handling_fatal_signal) /* Don't bother free'ing if about to die.  */
@@ -839,6 +881,10 @@
 }
 #endif
 
+static void noop(int sig) {
+
+}
+
 #ifdef MAKE_JOBSERVER
 /* Set the child handler action flags to FLAGS.  */
 static void
@@ -855,6 +901,10 @@
 #if defined SIGCLD && SIGCLD != SIGCHLD
   sigaction (SIGCLD, &sa, NULL);
 #endif
+
+  sa.sa_handler = noop;
+  sa.sa_flags = flags;
+  sigaction (SIGALRM, &sa, NULL);
 }
 #endif
 
@@ -1308,6 +1358,7 @@
     }
 
   /* Start the first command; reap_children will run later command lines.  */
+  reap_lock=1;
   start_job_command (c);
 
   switch (f->command_state)
@@ -1337,6 +1388,7 @@
       assert (f->command_state == cs_finished);
       break;
     }
+  reap_lock=0;
 
   return 1;
 }
@@ -1503,11 +1555,12 @@
        int got_token;
        int saved_errno;
 
-        DB (DB_JOBS, ("Need a job token; we %shave children\n",
-                      children ? "" : "don't "));
+        DB (DB_JOBS, ("Need a job token; we %s have children and %s have 
waiting jobs\n",
+                      children ? "DO" : "don't ",
+                     waiting_jobs ? "DO" : "don't"));
 
         /* If we don't already have a job started, use our "free" token.  */
-        if (!children)
+        if ((!children) && (!waiting_jobs))
           break;
 
         /* Read a token.  As long as there's no token available we'll block.
@@ -1542,21 +1595,36 @@
         /* Reap anything that's currently waiting.  */
         reap_children (0, 0);
 
-        /* If our "free" token has become available, use it.  */
-        if (!children)
-          break;
+       /* Kick off any jobs we have waiting for an opportunity that
+           can run now (ie waiting for load). */
+       start_waiting_jobs ();
+
+        /* If our "free" slot has become available, use it; we don't need an 
actual token.  */
+        if ((!children) && (!waiting_jobs))
+         break;
+
+       /* There must be at least one child already, or we have no business 
waiting for a token. */
+       if (!children) {
+         fprintf(stderr, "no children as we go to sleep on read, make pid 
%d\n", getpid());
+         abort();
+       }
 
         /* Set interruptible system calls, and read() for a job token.  */
        set_child_handler_action_flags (0);
+       alarm(1);               /* wake ocasionally, mostly to call
+                                   start_waiting_jobs in case load
+                                   dropped during some lengthy job. */
        got_token = read (job_rfd, &token, 1);
        saved_errno = errno;
+       alarm(0);
        set_child_handler_action_flags (SA_RESTART);
 
         /* If we got one, we're done here.  */
        if (got_token == 1)
           {
-            DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s).\n"),
-                          (unsigned long int) c, c->file->name));
+           jobserver_tokens++;
+            DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s) now have %d 
in pid %d.\n"),
+                          (unsigned long int) c, c->file->name, 
jobserver_tokens, getpid()));
             break;
           }
 
@@ -1565,13 +1633,15 @@
        errno = saved_errno;
         if (errno != EINTR && errno != EBADF)
           pfatal_with_name (_("read jobs pipe"));
-        if (errno == EBADF)
+        if (errno == EBADF) {
           DB (DB_JOBS, ("Read returned EBADF.\n"));
+       }
       }
 #endif
 
-  /* The job is now primed.  Start it running.
-     (This will notice if there are in fact no commands.)  */
+ start_new_job:
+  /* The job is now primed.  Start it running.  (This will notice if
+     there are in fact no commands.)  */
   (void) start_waiting_job (c);
 
   if (job_slots == 1 || not_parallel)

--
Grant Taylor - http://www.sw.starentnetworks.com/




reply via email to

[Prev in Thread] Current Thread [Next in Thread]