From a7a456cd6fab22eab69303a4430edf5501187299 Mon Sep 17 00:00:00 2001 From: Maxim Cournoyer Date: Tue, 24 Aug 2021 00:20:50 -0400 Subject: base: Annotate output with job information. Before this change, it was difficult to discern which job emitted which output, as there was no information connecting the job to the output it produced. This change rectifies that by annotating each line output by cron/mcron with a prefix that contains a timestamp and the job name. It also reports about when the job runs and whether it completed successfully or failed. It was initially suggested here: . Thanks to the fine people from the #guile libera.chat IRC channel for providing ideas and help; this change would not have been possible without them! * src/mcron/base.scm (install-suspendable-ports!): Install suspendable ports. (%date-format, %log-format): New parameters. (validate-date-format, validate-log-format): New procedures. (): New record. (run-job): Update doc. Redirect stdout and stderr to a pipe. Return a instance containing the input port and other information about the job. Output job status messages. (process-output): New procedure. (child-cleanup): Add docstring. Use positive logic. Call 'process-output' one last time after a child process is collected. (run-job-loop): Add a CHILDREN-DATA variable to the loop. Provide the open file descriptors of the children ports to select*, and collect their output when they trigger select. * tests/base.scm ("run-job: basic"): Adjust and fix indentation. (dummy-job/capture-output): New procedure. ("run-job, output"): New test. ("validate-date-format, valid", "validate-date-format, invalid") ("validate-log-format, valid", "validate-log-format, invalid") ("run-job, output with custom format", "run-job, failure") ("run-job, failure in shell action"): New tests. * src/mcron/scripts/cron.scm (show-help): Document new options. (%options) [log-format, date-format]: New options. (main): Parameterize the main loop with the new parameter options (or their default values when not provided); move exception handling elsewhere (see below). * src/mcron/scripts/mcron.scm: Likewise. * src/cron.in: Install error handler here. * src/mcron.in: Likewise. * doc/mcron.texi: Document new cron and mcron options, as well as new (mcron base) APIs. * tests/basic.sh: Test the new options. Suggested-by: Robert Vollmert --- src/mcron/base.scm | 328 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 270 insertions(+), 58 deletions(-) (limited to 'src/mcron/base.scm') diff --git a/src/mcron/base.scm b/src/mcron/base.scm index 037a9b7..6e325f7 100644 --- a/src/mcron/base.scm +++ b/src/mcron/base.scm @@ -2,6 +2,7 @@ ;;; Copyright © 2003 Dale Mellor ;;; Copyright © 2015, 2016, 2018 Mathieu Lirzin ;;; Copyright © 2016, 2020 Ludovic Courtès +;;; Copyright © 2021 Maxim Cournoyer ;;; ;;; This file is part of GNU Mcron. ;;; @@ -26,18 +27,29 @@ ;;;; Code: (define-module (mcron base) + #:use-module (ice-9 format) #:use-module (ice-9 match) #:use-module (ice-9 control) + #:use-module (ice-9 rdelim) + #:use-module (ice-9 suspendable-ports) #:use-module (mcron environment) #:use-module (mcron utils) #:use-module (srfi srfi-1) #:use-module (srfi srfi-2) #:use-module (srfi srfi-9) + #:use-module ((srfi srfi-19) #:prefix srfi-19:) + #:use-module (srfi srfi-26) #:use-module (srfi srfi-111) #:export (add-job remove-user-jobs display-schedule run-job-loop + + %date-format + %log-format + validate-date-format + validate-log-format + ;; Deprecated and undocumented procedures. use-system-job-list use-user-job-list @@ -45,6 +57,8 @@ #:re-export (clear-environment-mods append-environment-mods)) +(install-suspendable-ports!) + ;; A cron job. (define-record-type (make-job user time-proc action environment displayable next-time) @@ -71,6 +85,48 @@ (user schedule-user set-schedule-user!) ;list of (current schedule-current set-schedule-current!)) ;symbol 'user or 'system +;; A (srfi srfi-19) format string for the date. It is used to format the +;; timestamp argument. Defaults to the local ISO-8601 date/time format. +(define %date-format (make-parameter "~5")) + +(define (validate-date-format fmt) + "Validate that FMT is a valid srfi-19 date format string." + (let ((time (srfi-19:current-time))) + (unless (false-if-exception + (srfi-19:date->string (srfi-19:time-utc->date time) fmt)) + (throw 'mcron-error 1 "invalid srfi-19 date format string +hint: consult 'info \"(guile) SRFI-19 Date to string\"'")))) + +;; An (ice-9 format) format string. The positional arguments applied to +;; format are: +;; 1. the timestamp; +;; 2. the job process PID; +;; 3. the action name; +;; 4. the message. +(define %log-format (make-parameter "~a ~2@*~a: ~a~%")) + +(define (validate-log-format fmt) + "Validate that FMT is a valid (ice-9 format) log format string." + (unless (with-output-to-port (%make-void-port "w") + (lambda () + (with-error-to-port (%make-void-port "w") + (lambda () + (false-if-exception + (format #f "~@?" fmt + "2021-08-17T15:23:12" 39143 "dummy" "message")))))) + (throw 'mcron-error 1 "invalid (ice-9 format) format string +hint: consult 'info \"(guile) Formatted Output\"'"))) + +;; Data about a running job process. +(define-record-type + (make-job-data pid port continuation name) + job-data? + (pid job-data:pid) ;string : the job process PID + (port job-data:port) ;port : an input + (continuation job-data:continuation ;a partial continuation to read port + set-job-data-continuation!) + (name job-data:name)) ;string : the name of the job action + (define %global-schedule ;; Global schedule used by 'mcron' and 'cron'. (make-schedule '() '() 'user)) @@ -172,50 +228,185 @@ unusable." ;; Apply PROC to the value stored in 'number-children'. (set-box! number-children (proc (unbox number-children)))) -(define (run-job job) - "Run JOB in a separate process. The process is run as JOB user with the +(define* (run-job job) + "Run JOB in a separate process. The process is run as JOB user with the environment properly set. Update the NEXT-TIME field of JOB by computing its -next value." - (if (= (primitive-fork) 0) - (dynamic-wind ;child - (const #t) - (λ () - (setgid (passwd:gid (job:user job))) - (setuid (passwd:uid (job:user job))) - ;; Handle the case where the home directory points to a nonexistent - ;; location, as can be the case when running the job as the "nobody" - ;; user. - (catch 'system-error - (lambda () - (chdir (passwd:dir (job:user job)))) - (lambda args - (let ((errno (system-error-errno args))) - (cond - ((= ENOENT errno) (chdir "/")) - (else (throw 'system-error args)))))) - (modify-environment (job:environment job) (job:user job)) - ((job:action job))) - (λ () - (primitive-exit 0))) - (begin ;parent - (update-number-children! 1+) - (job:next-time-set! job ((job:next-time-function job) - (current-time)))))) - -(define (child-cleanup) - ;; Give any zombie children a chance to die, and decrease the number known - ;; to exist. - (unless (or (<= (unbox number-children) 0) - (= (car (waitpid WAIT_ANY WNOHANG)) 0)) - (update-number-children! 1-) - (child-cleanup))) - -(define* (run-job-loop #:optional fd-list #:key (schedule %global-schedule)) - ;; Loop over all job specifications, get a list of the next ones to run (may - ;; be more than one). Set an alarm and go to sleep. When we wake, run the - ;; jobs and reap any children (old jobs) that have completed. Repeat ad - ;; infinitum. - ;; +next value. Return a record object containing the job process +PID, the input pipe from which the process standard output and standard error +streams can be read as well as the name of the job." + (define start (srfi-19:current-time srfi-19:time-monotonic)) ;start time + + (define (seconds-since start) + ;; Return the time elapsed in seconds since START. + (let* ((end (srfi-19:current-time srfi-19:time-monotonic)) + (elapsed (srfi-19:time-difference end start))) + (+ (srfi-19:time-second elapsed) + (* 1e-9 (srfi-19:time-nanosecond elapsed))))) + + ;; Create a pipe, and set its read side to non-blocking mode. + (define child->parent-pipe (pipe)) + (let ((flags (fcntl (car child->parent-pipe) F_GETFL))) + (fcntl (car child->parent-pipe) F_SETFL (logior O_NONBLOCK flags))) + + ;; Empty buffers to avoid duplicated output. + (flush-all-ports) + + (match (primitive-fork) + (0 ;child + ;; Prepare the environment. + + ;; Connect the stdout and stderr outputs of the child process to the + ;; pipe established in the parent. + (close (car child->parent-pipe)) ;unused input pipe + (dup2 (port->fdes (cdr child->parent-pipe)) 1) + (dup2 1 2) + (set-current-output-port (cdr child->parent-pipe)) + (set-current-error-port (cdr child->parent-pipe)) + + ;; Use line buffering so the output is printed in "real time". + (setvbuf (current-output-port) 'line) + (setvbuf (current-error-port) 'line) + + (setgid (passwd:gid (job:user job))) + (setuid (passwd:uid (job:user job))) + ;; Handle a nonexistent home directory, as can be the case when running + ;; the job as the "nobody" user. + (catch 'system-error + (lambda () + (chdir (passwd:dir (job:user job)))) + (lambda args + (let ((errno (system-error-errno args))) + (cond + ((= ENOENT errno) (chdir "/")) + (else (throw 'system-error args)))))) + (modify-environment (job:environment job) (job:user job)) + + ;; Execute the action. + (catch #t + (lambda () + (format #t "running...~%") + (flush-all-ports) + (let* ((result ((job:action job))) + (exit-val/maybe (false-if-exception + (status:exit-val result)))) + (when (and exit-val/maybe + (not (= 0 exit-val/maybe))) + (error "unclean exit status" exit-val/maybe))) + (format #t "completed in ~,3fs~%" (seconds-since start)) + (flush-all-ports) + (primitive-exit 0)) + (lambda args + (format (current-error-port) "failed after ~,3fs with: ~a~%" + (seconds-since start) args) + (flush-all-ports) + (primitive-exit 1)))) + (child-pid ;parent + (update-number-children! 1+) + (job:next-time-set! job ((job:next-time-function job) + (current-time))) + (close (cdr child->parent-pipe)) ;disconnect the write end of the pipe + (make-job-data + child-pid ;pid + (car child->parent-pipe) ;port + #f ;continuation for a suspended port + (job:displayable job))))) ;name + +(define* (process-output children-data) + "Read the child processes output from their input port recorded in +CHILDREN-DATA and print an annotated version of each line to the standard +output port. Ports are closed upon reading the EOF character. As a side +effect, save the partial continuation of any suspended port to their +associated instance." + (define timestamp (srfi-19:date->string + (srfi-19:time-monotonic->date + (srfi-19:current-time srfi-19:time-monotonic)) + (%date-format))) + + ;; Use line buffering so the output is printed in "real time". + (setvbuf (current-output-port) 'line) + + (define (log-data data) + ;; Print the lines as they become available. Do not block when a line + ;; could not be read. + (let ((name (job-data:name data)) + (pid (job-data:pid data)) + (port (job-data:port data))) + + (define (read-line*) + ;; Return, as a pair, the line and the terminated delimiter or end-of-file + ;; object. When a line cannot be read, return the '(suspended + ;; . partial-continuation) pair, where partial-continuation can be + ;; evaluated in the future when the port is ready to be read. + (call-with-prompt 'continue + (lambda () + (parameterize ((current-read-waiter + (lambda (_) + (abort-to-prompt 'continue)))) + (let ((cont (job-data:continuation data))) + (if cont + (begin + (set-job-data-continuation! data #f) ;reset continuation + (cont)) + ;; Also use the carriage return as a line delimiter to + ;; preserve the full output in a readable way. + (read-delimited "\n\r" port 'split))))) + (lambda (partial-continuation) + (cons 'suspended partial-continuation)))) + + (define (format-line line) + (format #t "~@?" (%log-format) timestamp pid name line)) + + (let loop ((line+delim (read-line*))) + (match line+delim + (('suspended . partial-continuation) + (set-job-data-continuation! data partial-continuation)) + ((line . (? eof-object?)) + (close port) + (unless (eof-object? line) + (format-line line))) + (("" . #\cr) + ;; A carriage return directly followed a delimiter. Ignore it. + (loop (read-line*))) + ((line . _) + (format-line line) + (loop (read-line*))))))) + + (for-each log-data + (remove (compose port-closed? job-data:port) + children-data))) + +(define (child-cleanup children-data) + "Give any zombie children a chance to die, and decrease the number known to +exist. CHILDREN-DATA is a list of objects. Return the pruned list +of CHILDREN-DATA." + (define has-children? (> (unbox number-children) 0)) + (define collected-pid (or (and has-children? + (car (waitpid WAIT_ANY WNOHANG))) + 0)) + (define (match-collected-pid? job-data) + (= (job-data:pid job-data) collected-pid)) + + (if (and has-children? + (not (= 0 collected-pid))) ;at least one process was collected + (begin + (update-number-children! 1-) + ;; Fully collect the output of the reaped child process. + (and=> (find match-collected-pid? children-data) + (lambda (child-data) + (process-output (list child-data)))) + (child-cleanup (remove match-collected-pid? children-data))) + children-data)) + +(define* (run-job-loop #:optional (fd-list '()) + #:key (schedule %global-schedule)) + "Loop over all job specifications in SCHEDULE, get a list of the next ones +to run (may be more than one). Set an alarm and go to sleep. When we wake, +run the jobs and reap any children (old jobs) that have completed. Repeat ad +infinitum." + ;; Validate the format parameters, so that we can fail early. + (validate-date-format (%date-format)) + (validate-log-format (%log-format)) + ;; Note that, if we wake ahead of time, it can only mean that a signal has ;; been sent by a crontab job to tell us to re-read a crontab file. In this ;; case we break out of the loop here, and let the main procedure deal with @@ -236,23 +427,44 @@ next value." (apply throw args))))))) (let/ec break - (let loop () + (let loop ((children-data '())) ;list of objects (match (find-next-jobs #:schedule schedule) ((next-time . next-jobs-lst) - (let ((sleep-time (if next-time - (- next-time (current-time)) - 2000000000))) + (let* ((sleep-time (if next-time + (- next-time (current-time)) + 2000000000)) + (ports (map job-data:port children-data)) + ;; Ensure closed ports are not put back into select, otherwise + ;; it would not block and EOFs would be read infinitely. + (children-fdes (filter-map (lambda (p) + (and (not (port-closed? p)) + (port->fdes p))) + ports))) (when (> sleep-time 0) - (match (select* fd-list '() '() sleep-time) + (match (select* (append fd-list children-fdes) + '() '() sleep-time) ((() () ()) ;; 'select' returned an empty set, perhaps because it got - ;; EINTR or EAGAIN. It's a good time to wait for child - ;; processes. - (child-cleanup)) - (((lst ...) () ()) - ;; There's some activity so leave the loop. - (break)))) - - (for-each run-job next-jobs-lst) - (child-cleanup) - (loop))))))) + ;; EINTR or EAGAIN. + (loop (child-cleanup children-data))) + (((fdes ...) () ()) + ;; Process any children input fdes ready to be read. + (let ((children-fdes/read-ready (lset-intersection + = children-fdes fdes))) + (when (not (null? (lset-difference + = fdes children-fdes/read-ready))) + ;; There was some crontab activity so leave the loop to + ;; process the update request. + (break)) + (unless (null? children-fdes/read-ready) + (process-output + (filter (lambda (x) + (member (port->fdes (job-data:port x)) + children-fdes/read-ready)) + children-data)) + (loop (child-cleanup children-data))))))) + + ;; The timeout has elapsed. Run the scheduled job(s). + (let ((new-children-data (map run-job next-jobs-lst)) + (pruned-children-data (child-cleanup children-data))) + (loop (append new-children-data pruned-children-data))))))))) -- cgit v1.2.3