[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[no subject]
From: |
Ludovic Courtès |
Date: |
Thu, 28 Sep 2023 06:53:42 -0400 (EDT) |
branch: master
commit 980ef610989895be5ac2ba7f9d1901e5c7f22934
Author: Ludovic Courtès <ludo@gnu.org>
AuthorDate: Tue Sep 26 15:16:22 2023 +0200
remote-worker: Statically determine build process parallelism.
Previously, build jobs would use the default #:max-build-jobs
and #:build-cores specified by guix-daemon. This would typically lead
each worker to use as many cores as available, leading to unreasonable
over-commitment.
With this change, each worker is assigned a fraction of the build cores.
Because it’s a static policy, it may lead to resource waste, but avoids
the problem mentioned above.
* src/cuirass/scripts/remote-worker.scm (run-build): Add #:parallelism
and pass it to ‘set-build-options*’.
(run-command): Add #:parallelism and pass it to ‘run-build’.
(start-worker): Add #:parallelism and pass it to ‘run-command’.
(worker-management-thunk): Pass #:parallelism to ‘start-worker’.
* src/cuirass/remote.scm (set-build-options*): Add #:build-cores and
pass it to ‘set-build-options’, along with #:max-build-jobs.
---
src/cuirass/remote.scm | 9 ++++++-
src/cuirass/scripts/remote-worker.scm | 44 ++++++++++++++++++++++-------------
2 files changed, 36 insertions(+), 17 deletions(-)
diff --git a/src/cuirass/remote.scm b/src/cuirass/remote.scm
index aed2552..0188596 100644
--- a/src/cuirass/remote.scm
+++ b/src/cuirass/remote.scm
@@ -40,6 +40,7 @@
#:use-module (ice-9 match)
#:use-module (ice-9 rdelim)
#:use-module (ice-9 suspendable-ports)
+ #:autoload (ice-9 threads) (current-processor-count)
#:use-module (fibers)
#:use-module (fibers scheduler)
#:export (worker
@@ -209,6 +210,7 @@ given NAME."
(define* (set-build-options* store urls
#:key
+ (build-cores (current-processor-count))
timeout
max-silent)
"Use URLS as substitution servers, set TIMEOUT and MAX-SILENT store
@@ -220,7 +222,12 @@ properties."
#:timeout timeout
#:max-silent-time max-silent
#:verbosity 1
- #:substitute-urls urls))
+ #:substitute-urls urls
+
+ ;; Each worker uses up to BUILD-CORES for its build
+ ;; processes and at most one build process at a time.
+ #:build-cores build-cores
+ #:max-build-jobs 1))
(define* (publish-server port
#:key
diff --git a/src/cuirass/scripts/remote-worker.scm
b/src/cuirass/scripts/remote-worker.scm
index 85dcb5d..7b9bd8a 100644
--- a/src/cuirass/scripts/remote-worker.scm
+++ b/src/cuirass/scripts/remote-worker.scm
@@ -206,9 +206,11 @@ ADDRESS and PORT."
reply
timeout
max-silent
+ (parallelism (current-processor-count))
worker)
"Build DRV and send messages upon build start, failure or completion to the
-build server identified by SERVICE-NAME using the REPLY procedure.
+build server identified by SERVICE-NAME using the REPLY procedure. Each build
+process may use up to PARALLELISM cores.
The publish server of the build server is added to the list of the store
substitutes-urls. This way derivations that are not present on the worker can
@@ -219,9 +221,12 @@ still be substituted."
(publish-url (server-publish-url server))
(local-publish-url (worker-publish-url worker))
(name (worker-name worker)))
+ ;; TODO: Choose PARALLELISM dynamically based on the number of currently
+ ;; running jobs and/or the current load.
(set-build-options* store (if publish-url
(cons publish-url (%substitute-urls))
(%substitute-urls))
+ #:build-cores parallelism
#:timeout timeout
#:max-silent max-silent)
(reply (build-started-message drv name))
@@ -255,7 +260,7 @@ still be substituted."
(define* (run-command command server
#:key
- reply worker)
+ reply worker (parallelism (current-processor-count)))
"Run COMMAND. SERVICE-NAME is the name of the build server that sent the
command. REPLY is a procedure that can be used to reply to this server."
(match command
@@ -270,6 +275,7 @@ command. REPLY is a procedure that can be used to reply to
this server."
(run-build drv server
#:reply reply
#:worker worker
+ #:parallelism parallelism
#:timeout timeout
#:max-silent max-silent))))
@@ -298,9 +304,10 @@ command. REPLY is a procedure that can be used to reply
to this server."
(< (free-disk-space (or (getenv "TMPDIR") "/tmp"))
(%minimum-disk-space))))
-(define (start-worker wrk serv)
+(define* (start-worker wrk serv #:key (parallelism (current-processor-count)))
"Start a worker thread named NAME, reading commands from the DEALER socket
-and executing them. The worker can reply on the same socket."
+and executing them. The worker can reply on the same socket. Each build
+process can use up to PARALLELISM cores."
(define (reply socket)
(lambda (message)
(send-message socket message)))
@@ -388,7 +395,8 @@ and executing them. The worker can reply on the same
socket."
(worker-name wrk) command)
(run-command command server
#:reply (reply socket)
- #:worker worker)))))
+ #:worker worker
+ #:parallelism parallelism)))))
(loop)))))))
@@ -399,17 +407,21 @@ SYSTEMS."
(let loop ()
(match (get-message channel)
(`(start-workers ,count ,server ,local-address)
- (log-info "starting ~a workers for server at ~a"
- count (server-address server))
- (let spawn ((i 0))
- (when (< i count)
- (start-worker (worker (name (generate-worker-name))
- (address local-address)
- (machine (gethostname))
- (publish-url (local-publish-url
local-address))
- (systems systems))
- server)
- (spawn (+ i 1))))))
+ (let ((parallelism (max (quotient (current-processor-count) count)
+ 1)))
+ (log-info
+ "starting ~a workers (parallelism: ~a cores) for server at ~a"
+ count parallelism (server-address server))
+ (let spawn ((i 0))
+ (when (< i count)
+ (start-worker (worker (name (generate-worker-name))
+ (address local-address)
+ (machine (gethostname))
+ (publish-url (local-publish-url
local-address))
+ (systems systems))
+ server
+ #:parallelism parallelism)
+ (spawn (+ i 1)))))))
(loop))))