guix-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[no subject]


From: Ludovic Courtès
Date: Thu, 28 Sep 2023 06:53:42 -0400 (EDT)

branch: master
commit 980ef610989895be5ac2ba7f9d1901e5c7f22934
Author: Ludovic Courtès <ludo@gnu.org>
AuthorDate: Tue Sep 26 15:16:22 2023 +0200

    remote-worker: Statically determine build process parallelism.
    
    Previously, build jobs would use the default #:max-build-jobs
    and #:build-cores specified by guix-daemon.  This would typically lead
    each worker to use as many cores as available, leading to unreasonable
    over-commitment.
    
    With this change, each worker is assigned a fraction of the build cores.
    Because it’s a static policy, it may lead to resource waste, but avoids
    the problem mentioned above.
    
    * src/cuirass/scripts/remote-worker.scm (run-build): Add #:parallelism
    and pass it to ‘set-build-options*’.
    (run-command): Add #:parallelism and pass it to ‘run-build’.
    (start-worker): Add #:parallelism and pass it to ‘run-command’.
    (worker-management-thunk): Pass #:parallelism to ‘start-worker’.
    * src/cuirass/remote.scm (set-build-options*): Add #:build-cores and
    pass it to ‘set-build-options’, along with #:max-build-jobs.
---
 src/cuirass/remote.scm                |  9 ++++++-
 src/cuirass/scripts/remote-worker.scm | 44 ++++++++++++++++++++++-------------
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/cuirass/remote.scm b/src/cuirass/remote.scm
index aed2552..0188596 100644
--- a/src/cuirass/remote.scm
+++ b/src/cuirass/remote.scm
@@ -40,6 +40,7 @@
   #:use-module (ice-9 match)
   #:use-module (ice-9 rdelim)
   #:use-module (ice-9 suspendable-ports)
+  #:autoload   (ice-9 threads) (current-processor-count)
   #:use-module (fibers)
   #:use-module (fibers scheduler)
   #:export (worker
@@ -209,6 +210,7 @@ given NAME."
 
 (define* (set-build-options* store urls
                              #:key
+                             (build-cores (current-processor-count))
                              timeout
                              max-silent)
   "Use URLS as substitution servers, set TIMEOUT and MAX-SILENT store
@@ -220,7 +222,12 @@ properties."
                      #:timeout timeout
                      #:max-silent-time max-silent
                      #:verbosity 1
-                     #:substitute-urls urls))
+                     #:substitute-urls urls
+
+                     ;; Each worker uses up to BUILD-CORES for its build
+                     ;; processes and at most one build process at a time.
+                     #:build-cores build-cores
+                     #:max-build-jobs 1))
 
 (define* (publish-server port
                          #:key
diff --git a/src/cuirass/scripts/remote-worker.scm 
b/src/cuirass/scripts/remote-worker.scm
index 85dcb5d..7b9bd8a 100644
--- a/src/cuirass/scripts/remote-worker.scm
+++ b/src/cuirass/scripts/remote-worker.scm
@@ -206,9 +206,11 @@ ADDRESS and PORT."
                     reply
                     timeout
                     max-silent
+                    (parallelism (current-processor-count))
                     worker)
   "Build DRV and send messages upon build start, failure or completion to the
-build server identified by SERVICE-NAME using the REPLY procedure.
+build server identified by SERVICE-NAME using the REPLY procedure.  Each build
+process may use up to PARALLELISM cores.
 
 The publish server of the build server is added to the list of the store
 substitutes-urls.  This way derivations that are not present on the worker can
@@ -219,9 +221,12 @@ still be substituted."
           (publish-url (server-publish-url server))
           (local-publish-url (worker-publish-url worker))
           (name (worker-name worker)))
+      ;; TODO: Choose PARALLELISM dynamically based on the number of currently
+      ;; running jobs and/or the current load.
       (set-build-options* store (if publish-url
                                     (cons publish-url (%substitute-urls))
                                     (%substitute-urls))
+                          #:build-cores parallelism
                           #:timeout timeout
                           #:max-silent max-silent)
       (reply (build-started-message drv name))
@@ -255,7 +260,7 @@ still be substituted."
 
 (define* (run-command command server
                       #:key
-                      reply worker)
+                      reply worker (parallelism (current-processor-count)))
   "Run COMMAND.  SERVICE-NAME is the name of the build server that sent the
 command.  REPLY is a procedure that can be used to reply to this server."
   (match command
@@ -270,6 +275,7 @@ command.  REPLY is a procedure that can be used to reply to 
this server."
      (run-build drv server
                 #:reply reply
                 #:worker worker
+                #:parallelism parallelism
                 #:timeout timeout
                 #:max-silent max-silent))))
 
@@ -298,9 +304,10 @@ command.  REPLY is a procedure that can be used to reply 
to this server."
       (< (free-disk-space (or (getenv "TMPDIR") "/tmp"))
          (%minimum-disk-space))))
 
-(define (start-worker wrk serv)
+(define* (start-worker wrk serv #:key (parallelism (current-processor-count)))
   "Start a worker thread named NAME, reading commands from the DEALER socket
-and executing them.  The worker can reply on the same socket."
+and executing them.  The worker can reply on the same socket.  Each build
+process can use up to PARALLELISM cores."
   (define (reply socket)
     (lambda (message)
       (send-message socket message)))
@@ -388,7 +395,8 @@ and executing them.  The worker can reply on the same 
socket."
                                (worker-name wrk) command)
                     (run-command command server
                                  #:reply (reply socket)
-                                 #:worker worker)))))
+                                 #:worker worker
+                                 #:parallelism parallelism)))))
 
            (loop)))))))
 
@@ -399,17 +407,21 @@ SYSTEMS."
     (let loop ()
       (match (get-message channel)
         (`(start-workers ,count ,server ,local-address)
-         (log-info "starting ~a workers for server at ~a"
-                   count (server-address server))
-         (let spawn ((i 0))
-           (when (< i count)
-             (start-worker (worker (name (generate-worker-name))
-                                   (address local-address)
-                                   (machine (gethostname))
-                                   (publish-url (local-publish-url 
local-address))
-                                   (systems systems))
-                           server)
-             (spawn (+ i 1))))))
+         (let ((parallelism (max (quotient (current-processor-count) count)
+                                 1)))
+           (log-info
+            "starting ~a workers (parallelism: ~a cores) for server at ~a"
+            count parallelism (server-address server))
+           (let spawn ((i 0))
+             (when (< i count)
+               (start-worker (worker (name (generate-worker-name))
+                                     (address local-address)
+                                     (machine (gethostname))
+                                     (publish-url (local-publish-url 
local-address))
+                                     (systems systems))
+                             server
+                             #:parallelism parallelism)
+               (spawn (+ i 1)))))))
       (loop))))
 
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]