bug-guix
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#41625: [PATCH] offload: Handle a possible EOF response from read-rep


From: Maxim Cournoyer
Subject: bug#41625: [PATCH] offload: Handle a possible EOF response from read-repl-response.
Date: Tue, 25 May 2021 11:50:03 -0400

Fixes <https://issues.guix.gnu.org/41625>.

* guix/scripts/offload.scm (check-machine-availability): Refactor so that it
takes a single machine object, to allow for retrying a single machine.  Handle
the case where the checks raised an exception due to the connection to the
build machine having been lost, and retry up to 3 times.  Ensure the cleanup
code is run in all situations.
(check-machines-availability): New procedure.  Call
CHECK-MACHINES-AVAILABILITY in parallel, which improves performance (about
twice as fast with 4 build machines, from ~30 s to ~15 s).
* guix/inferior.scm (&inferior-connection-lost): New condition type.
(read-repl-response): Raise a condition of the above type when reading EOF
from the build machine's port.
---
 guix/inferior.scm        |  9 ++++++++
 guix/scripts/offload.scm | 50 +++++++++++++++++++++++++++++-----------
 2 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/guix/inferior.scm b/guix/inferior.scm
index 7c8e478f2a..4ac1ea3484 100644
--- a/guix/inferior.scm
+++ b/guix/inferior.scm
@@ -1,5 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2018, 2019, 2020, 2021 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2021 Maxim Cournoyer <maxim.cournoyer@gmail.com>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -70,6 +71,7 @@
             inferior-exception-arguments
             inferior-exception-inferior
             inferior-exception-stack
+            inferior-connection-lost?
             read-repl-response
 
             inferior-packages
@@ -228,6 +230,9 @@ equivalent.  Return #f if the inferior could not be 
launched."
   (inferior   inferior-exception-inferior)        ;<inferior> | #f
   (stack      inferior-exception-stack))          ;list of (FILE COLUMN LINE)
 
+(define-condition-type &inferior-connection-lost &error
+  inferior-connection-lost?)
+
 (define* (read-repl-response port #:optional inferior)
   "Read a (guix repl) response from PORT and return it as a Scheme object.
 Raise '&inferior-exception' when an exception is read from PORT."
@@ -241,6 +246,10 @@ Raise '&inferior-exception' when an exception is read from 
PORT."
   (match (read port)
     (('values objects ...)
      (apply values (map sexp->object objects)))
+    ;; Unexpectedly read EOF from the port.  This can happen for example when
+    ;; the underlying connection for PORT was lost with Guile-SSH.
+    (? eof-object?
+       (raise (condition (&inferior-connection-lost))))
     (('exception ('arguments key objects ...)
                  ('stack frames ...))
      ;; Protocol (0 1 1) and later.
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm
index 835078cb97..7c6d38b218 100644
--- a/guix/scripts/offload.scm
+++ b/guix/scripts/offload.scm
@@ -1,7 +1,7 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès 
<ludo@gnu.org>
 ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
-;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com>
+;;; Copyright © 2020, 2021 Maxim Cournoyer <maxim.cournoyer@gmail.com>
 ;;; Copyright © 2020 Julien Lepiller <julien@lepiller.eu>
 ;;;
 ;;; This file is part of GNU Guix.
@@ -53,6 +53,7 @@
   #:use-module (ice-9 regex)
   #:use-module (ice-9 format)
   #:use-module (ice-9 binary-ports)
+  #:use-module (ice-9 threads)
   #:export (build-machine
             build-machine?
             build-machine-name
@@ -684,7 +685,7 @@ daemon is not running."
           (leave (G_ "failed to import '~a' from '~a'~%")
                  item name)))))
 
-(define (check-machine-availability machine-file pred)
+(define (check-machines-availability machine-file pred)
   "Check that each machine matching PRED in MACHINE-FILE is usable as a build
 machine."
   (define (build-machine=? m1 m2)
@@ -696,18 +697,39 @@ machine."
   (let ((machines (filter pred
                           (delete-duplicates (build-machines machine-file)
                                              build-machine=?))))
-    (info (G_ "testing ~a build machines defined in '~a'...~%")
+    (info (G_ "Testing ~a build machines defined in '~a'...~%")
           (length machines) machine-file)
-    (let* ((names    (map build-machine-name machines))
-           (sockets  (map build-machine-daemon-socket machines))
-           (sessions (map (cut open-ssh-session <> %short-timeout) machines))
-           (nodes    (map remote-inferior sessions)))
-      (for-each assert-node-has-guix nodes names)
-      (for-each assert-node-repl nodes names)
-      (for-each assert-node-can-import sessions nodes names sockets)
-      (for-each assert-node-can-export sessions nodes names sockets)
-      (for-each close-inferior nodes)
-      (for-each disconnect! sessions))))
+    (par-for-each check-machine-availability machines)))
+
+(define (check-machine-availability machine)
+  "Check whether MACHINE is available.  Exit with an error upon failure."
+  ;; Sometimes, the machine remote port may return EOF, presumably because the
+  ;; connection was lost.  Retry up to 3 times.
+  (let loop ((retries 3))
+    (guard (c ((inferior-connection-lost? c)
+               (let ((retries-left (1- retries)))
+                 (if (> retries-left 0)
+                     (begin
+                       (format (current-error-port)
+                               (G_ "connection to machine ~s lost; retrying~%")
+                               (build-machine-name machine))
+                       (loop (retries-left)))
+                     (leave (G_ "connection repeatedly lost with machine 
'~a'~%")
+                            (build-machine-name machine))))))
+      (let* ((name    (build-machine-name machine))
+             (socket  (build-machine-daemon-socket machine))
+             (session (open-ssh-session machine %short-timeout))
+             (node    (remote-inferior session)))
+        (dynamic-wind
+          (lambda () #t)
+          (lambda ()
+            (assert-node-has-guix node name)
+            (assert-node-repl node name)
+            (assert-node-can-import session node name socket)
+            (assert-node-can-export session node name socket))
+          (lambda ()
+            (close-inferior node)
+            (disconnect! session)))))))
 
 (define (check-machine-status machine-file pred)
   "Print the load of each machine matching PRED in MACHINE-FILE."
@@ -824,7 +846,7 @@ machine."
                        ((file) (values file (const #t)))
                        (()     (values %machine-file (const #t)))
                        (x      (leave (G_ "wrong number of arguments~%"))))))
-         (check-machine-availability (or file %machine-file) pred))))
+         (check-machines-availability (or file %machine-file) pred))))
     (("status" rest ...)
      (with-error-handling
        (let-values (((file pred)
-- 
2.31.1






reply via email to

[Prev in Thread] Current Thread [Next in Thread]