bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#52209: 28.0.60; [PATCH] date-to-time fails on pure dates


From: Bob Rogers
Subject: bug#52209: 28.0.60; [PATCH] date-to-time fails on pure dates
Date: Thu, 23 Dec 2021 14:48:04 -0500

   From: Lars Ingebrigtsen <larsi@gnus.org>
   Date: Tue, 21 Dec 2021 12:01:07 +0100

   Bob Rogers <rogers-emacs@rgrjr.homedns.org> writes:

   >    On the other hand, I can imagine the caller might want to insist
   > that the passed string must be in a certain format and force an
   > error if parse-time finds otherwise.

   Yup.  That's one good reason to not have a time parsing function guess
   at formats, because the input data will be different.

OK, I have proceeded along those lines; WIP attached for feedback.  I
changed the name to "parse-date" to avoid confusion; I was otherwise
stuck when trying to come up with a sensible name for the test file,
since parse-time-tests.el was already taken (though I suppose I could
have added to the existing file).  The docstring of parse-date describes
the expected functionality as far as I've planned, with comments in
square brackets to note what's missing.

   In my previous job, we had a library to parse date/time strings, and I
   think we were up to about 80 distinct formats to handle the different
   data feeds we were getting.  For instance, "01 02 03" may be three
   different dates depending on where you get the date from.

Which (additional) formats would you like?  I'm assuming we need iso8601
and rfc822 for compatibility (in which case rfc2822 will be easy to
provide in addition), and us-date and euro-date to disambiguate the
month/day order.  Would the third format correspond to ISO 2001-01-03?
Do we want to support that?

   And come to think of it, I've been using DD-Mon-YY for my own
purposes for so long that I'm not even certain whether Americans use
MM-DD-YY or it's the other way around . . .

                                        -- Bob

diff --git a/lisp/calendar/parse-date.el b/lisp/calendar/parse-date.el
new file mode 100644
index 0000000000..c4b756cf2e
--- /dev/null
+++ b/lisp/calendar/parse-date.el
@@ -0,0 +1,281 @@
+;;; parse-date.el --- parsing time/date strings -*- lexical-binding: t -*-
+
+;; Copyright (C) 2021 Free Software Foundation, Inc.
+
+;; Author: Bob Rogers <rogers@rgrjr.com>
+;; Keywords: util
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; 'parse-date' parses a time and/or date in a string and returns a
+;; list of values, just like `decode-time', where unspecified elements
+;; in the string are returned as nil (except unspecified DST is
+;; returned as -1).  `encode-time' may be applied on these values to
+;; obtain an internal time value.  If left to its own devices, it
+;; accepts a wide variety of formats, but can be told to insist on a
+;; particular date/time format.
+
+;; Historically, `parse-time-string' was used for this purpose, but it
+;; was focused on email date formats, and gradually but imperfectly
+;; extended to handle other formats.  'parse-date' is compatible in
+;; that it parses the same input formats and uses the same return
+;; value format, but is stricter in that it signals an error for
+;; tokens that `parse-time-string' would simply ignore.
+
+;;; TODO:
+;;
+;; * Define and signal a date-error for parsing issues.
+;;
+;; * Implement rfc2822 and rfc822 independently of parse-time-string.
+;;
+;; * Add a euro-date format for DD/MM/YYYY ?
+;;
+
+;;; Code:
+
+(require 'cl-lib)
+(require 'iso8601)
+(require 'parse-time)
+
+(defun parse-date-guess-format (time-string)
+  (cond ((string-match "[0-9]T[0-9]" time-string) 'iso8601)
+        (t nil)))
+
+(defun parse-date-ignore-char? (char)
+  (or (eq char ?\ ) (eq char ?,) (eq char ?,)))
+
+(defun parse-date-tokenize-string (string)
+  "Turn STRING into tokens, separated only by whitespace and commas.
+Multiple commas are ignored.  Pure digit sequences are turned
+into integers."
+  (let ((index 0)
+       (end (length string))
+        (char nil)
+       (list ()))
+    ;; Skip leading ignored characters.
+    (while (and (< index end)
+                (setq char (aref string index))
+                (parse-date-ignore-char? char))
+      (cl-incf index))
+    (while (< index end)
+      (let ((start index)
+            (all-digits (<= ?0 char ?9)))
+        ;; char is valid; look for more valid characters.
+        (while (and (< (cl-incf index) end)
+                    (setq char (aref string index))
+                    (not (parse-date-ignore-char? char)))
+          (unless (<= ?0 char ?9)
+           (setq all-digits nil)))
+        (when (<= index end)
+         (push (if all-digits
+                    (cl-parse-integer string :start start :end index)
+                 (substring string start index))
+               list)
+          ;; Skip ignored characters.
+          (while (and (< (cl-incf index) end)
+                      (setq char (aref string index))
+                      (parse-date-ignore-char? char))
+            ())
+          ;; Next token.
+          )))
+    (nreverse list)))
+
+(defconst parse-date-slot-names
+  '(second minute hour day month year weekday dst zone)
+  "Names of return value slots, for better error messages
+See the decoded-time defstruct.")
+
+(defconst parse-date-slot-ranges
+  '((0 59) (0 59) (0 23) (1 31) (1 12) (1 9999))
+  "Numeric slot ranges, for bounds checking.")
+
+(defun parse-date-default (time-string two-digit-year?)
+  ;; Do the standard parsing thing.  This is mostly free form, in that
+  ;; tokens may appear in any order, but we expect to introduce some
+  ;; state dependence.
+  (let ((tokens (parse-date-tokenize-string (downcase time-string)))
+        (time (list nil nil nil nil nil nil nil -1 nil)))
+    (cl-flet ((set-matched-slot (slot index token)
+                ;; Assign a slot value from match data if index is
+                ;; non-nil, else from token, signalling an error if
+                ;; it's already been assigned or is out of range.
+                (let ((value (if index
+                                 (cl-parse-integer (match-string index token))
+                               token))
+                      (range (nth slot parse-date-slot-ranges)))
+                  (unless (equal (nth slot time)
+                                 (if (= slot 7) -1 nil))
+                    (error "Duplicate %s slot value '%s'"
+                           (nth slot parse-date-slot-names) token))
+                  (when (and range
+                             (not (<= (car range) value (cadr range))))
+                    (error "Value %s is out of range for %s"
+                           token (nth slot parse-date-slot-names)))
+                  (setf (nth slot time) value))))
+      (while tokens
+        (let ((token (pop tokens))
+              (match nil))
+          (cond ((numberp token)
+                  ;; A bare number could be a month, day, or year.
+                  ;; The order of these tests matters greatly.
+                  (cond ((>= token 1000)
+                          (set-matched-slot 5 nil token))
+                        ((and (<= 1 token 31)
+                              (not (nth 3 time)))
+                          ;; Assume days come before months or years.
+                          (set-matched-slot 3 nil token))
+                        ((and (<= 1 token 12)
+                              (not (nth 4 time)))
+                          ;; Assume days come before years.
+                          (set-matched-slot 4 nil token))
+                        ((or (nth 5 time)
+                             (not two-digit-year?)
+                             (> token 100))
+                          (error "Unrecognized numeric value %s" token))
+                        ;; It's a two-digit year.
+                        ((>= token 50)
+                          ;; second half of the 20th century.
+                          (set-matched-slot 5 nil (+ 1900 token)))
+                        (t
+                          ;; first half of the 21st century.
+                          (set-matched-slot 5 nil (+ 2000 token)))))
+                ((setq match (assoc token parse-time-weekdays))
+                  (set-matched-slot 6 nil (cdr match)))
+                ((setq match (assoc token parse-time-months))
+                  (set-matched-slot 4 nil (cdr match)))
+                ((setq match (assoc token parse-time-zoneinfo))
+                  (set-matched-slot 8 nil (cadr match))
+                  (set-matched-slot 7 nil (caddr match)))
+                ((string-match "^[-+][0-9][0-9][0-9][0-9]$" token)
+                  ;; Numeric time zone.
+                  (set-matched-slot
+                    8 nil
+                    (* 60
+                       (+ (cl-parse-integer token :start 3 :end 5)
+                          (* 60 (cl-parse-integer token :start 1 :end 3)))
+                       (if (= (aref token 0) ?-) -1 1))))
+                ((string-match
+                  
"^\\([0-9][0-9][0-9][0-9]\\)[-/]\\([0-9][0-9]?\\)[-/]\\([0-9][0-9]?\\)$"
+                  token)
+                  ;; ISO-8601-style date (YYYY-MM-DD).
+                  (set-matched-slot 5 1 token)
+                  (set-matched-slot 4 2 token)
+                  (set-matched-slot 3 3 token))
+                ((string-match
+                  
"^\\([0-9][0-9]?\\)[-/]\\([0-9][0-9]?\\)[-/]\\([0-9][0-9][0-9][0-9]\\)$"
+                  token)
+                  ;; US date (MM-DD-YYYY), but we insist on four
+                  ;; digits for the year.
+                  (set-matched-slot 4 1 token)
+                  (set-matched-slot 3 2 token)
+                  (set-matched-slot 5 3 token))
+                ((string-match
+                  "^\\([0-9][0-9]?\\):\\([0-9][0-9]\\):\\([0-9][0-9]\\)$"
+                  token)
+                  (set-matched-slot 2 1 token)
+                  (set-matched-slot 1 2 token)
+                  (set-matched-slot 0 3 token))
+                ((string-match "^\\([0-9][0-9]?\\):\\([0-9][0-9]\\)$" token)
+                  ;; Time without seconds.
+                  (set-matched-slot 2 1 token)
+                  (set-matched-slot 1 2 token)
+                  (set-matched-slot 0 nil 0))
+                ((member token '("am" "pm"))
+                  (unless (nth 2 time)
+                    (error "'AM'/'PM' specified before or without time"))
+                  (unless (<= (nth 2 time) 12)
+                    (error "'AM'/'PM' specified for time already past noon"))
+                  (when (equal token "pm")
+                    (cl-incf (nth 2 time) 12)))
+                (t
+                  (error "Unrecognized time token '%s'" token))))))
+    time))
+
+;;;###autoload
+(defun parse-date (time-string &optional format)
+  "Parse TIME-STRING according to FORMAT, returning a list.
+The FORMAT value is a symbol that may be one of the following:
+
+   iso8601 => parse the string according to the ISO-8601
+standard.  See `parse-iso8601-time-string'.
+
+   iso-8601 => synonym for iso8601.
+
+   rfc822 => parse an RFC822 (old email) date, which allows
+two-digit years and internal '()' comments.  In dates of the form
+'11 Jan 12', the 11 is assumed to be the day, and the 12 is
+assumed to mean 2012.  [not fully implemented.]
+
+   rfc2822 => parse an RFC2822 (new email) date, which allows
+only four-digit years.  [not implemented.]
+
+   us-date => parse a US-style date, of the form MM/DD/YYYY, but
+allowing two-digit years.  In dates of the form '01/11/12', the 1
+is the month, 11 is the day, and the 12 is assumed to mean 2012.
+[not fully implemented.]
+
+   nil => attempt to guess the format, falling back on us-date
+with two-digit years disallowed.
+
+The default is nil, and anything else is assumed to be us-date
+with two-digit years disallowed.
+
+   * For all formats except iso8601, parsing is case-insensitive.
+
+   * Commas and whitespace are ignored.
+
+   * In date specifications, either '/' or '-' may be used to
+separate components, but all three components must be given.
+
+   * A date that starts with four digits is YYYY-MM-DD, ISO-8601
+style, but a date that ends with four digits is MM-DD-YYYY [at
+least in us-date format].
+
+   * Two digit years, when allowed, are in the 1900's when
+between 50 and 99 and in the 2000's when between 0 and 49.
+
+Errors are signalled when time values are duplicated,
+unrecognized, or out of range.  No consistency checks between
+fields are done.  For instance, the weekday is not checked to see
+that it corresponds to the date, and parse-date complains about
+the 32nd of March (or any other month) but blithely accepts the
+29th of February in non-leap years -- or the 31st of February in
+any year.
+
+The result is a list of (SEC MIN HOUR DAY MON YEAR DOW DST TZ),
+which can be accessed as a decoded-time defstruct (q.v.),
+e.g. `decoded-time-year' to extract the year, and turned into an
+Emacs timestamp by `encode-time'.  The values returned are
+identical to those of `decode-time', but any unknown values other
+than DST are returned as nil, and an unknown DST value is
+returned as -1."
+  (cl-case (or format (parse-date-guess-format time-string))
+    ((iso8601 iso-8601)
+      (parse-iso8601-time-string time-string))
+    ((rfc822 rfc2822)
+      ;; [Placeholder; we eventually want something more strict.  --
+      ;; rgr, 20-Dec-21.]
+      (parse-time-string time-string))
+    (us-date
+      (parse-date-default time-string t))
+    (t
+      (parse-date-default time-string nil))))
+
+(provide 'parse-date)
+
+;;; parse-date.el ends here
diff --git a/test/lisp/calendar/parse-date-tests.el 
b/test/lisp/calendar/parse-date-tests.el
new file mode 100644
index 0000000000..682365e674
--- /dev/null
+++ b/test/lisp/calendar/parse-date-tests.el
@@ -0,0 +1,164 @@
+;;; parse-date-tests.el --- Test suite for parse-date.el  -*- 
lexical-binding:t -*-
+
+;; Copyright (C) 2016-2021 Free Software Foundation, Inc.
+
+;; Author: Lars Ingebrigtsen <larsi@gnus.org>
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;;; Code:
+
+(require 'ert)
+(require 'parse-date)
+
+(ert-deftest parse-date-tests ()
+  "Test basic parse-date functionality."
+
+  ;; Test tokenization.
+  (should (equal (parse-date-tokenize-string " ") '()))
+  (should (equal (parse-date-tokenize-string " a b") '("a" "b")))
+  (should (equal (parse-date-tokenize-string "a bbc dde") '("a" "bbc" "dde")))
+  (should (equal (parse-date-tokenize-string " , a 27 b,, c 14:32 ")
+                 '("a" 27 "b" "c" "14:32")))
+
+  ;; Start with some RFC822 dates.
+  (dolist (format '(nil rfc822))
+    (should (equal (parse-date "Mon, 22 Feb 2016 19:35:42 +0100" format)
+                   '(42 35 19 22 2 2016 1 -1 3600)))
+    (should (equal (parse-date "22 Feb 2016 19:35:42 +0100" format)
+                   '(42 35 19 22 2 2016 nil -1 3600)))
+    (should (equal (parse-date "22 Feb 2016 +0100" format)
+                   '(nil nil nil 22 2 2016 nil -1 3600)))
+    (should (equal (parse-date "Mon, 22 February 2016 19:35:42 +0100" format)
+                   '(42 35 19 22 2 2016 1 -1 3600)))
+    (should (equal (parse-date "Mon, 22 feb 2016 19:35:42 +0100" format)
+                   '(42 35 19 22 2 2016 1 -1 3600)))
+    (should (equal (parse-date "Monday, 22 february 2016 19:35:42 +0100" 
format)
+                   '(42 35 19 22 2 2016 1 -1 3600)))
+    (should (equal (parse-date "Monday, 22 february 2016 19:35:42 PST" format)
+                   '(42 35 19 22 2 2016 1 nil -28800)))
+    (should (equal (parse-date "Friday, 21 Sep 2018 13:47:58 PDT" format)
+                   '(58 47 13 21 9 2018 5 t -25200)))
+    (should (equal (parse-date "Friday, 21 Sep 2018 13:47:58" format)
+                   '(58 47 13 21 9 2018 5 -1 nil)))
+    (should (equal (parse-date "Friday, 21 Sep 2018" format)
+                   '(nil nil nil 21 9 2018 5 -1 nil))))
+  ;; These are not allowed by the default format.
+  (should (equal (parse-date "22 Feb 16 19:35:42 +0100" 'rfc822)
+                 '(42 35 19 22 2 2016 nil -1 3600)))
+  (should (equal (parse-date "22 Feb 96 19:35:42 +0100" 'rfc822)
+                 '(42 35 19 22 2 1996 nil -1 3600)))
+
+  ;; Test the default format with both hyphens and slashes in dates.
+  (dolist (case '(;; Month can be numeric if date uses hyphens/slashes.
+                  ("Friday, 2018-09-21" (nil nil nil 21 9 2018 5 -1 nil))
+                  ;; Year can come last if four digits.
+                  ("Friday, 9-21-2018" (nil nil nil 21 9 2018 5 -1 nil))
+                  ;; Day of week is optional
+                  ("2018-09-21" (nil nil nil 21 9 2018 nil -1 nil))
+                  ;; The order of date, time, etc., does not matter.
+                  ("13:47:58, +0100, 2018-09-21, Friday"
+                   (58 47 13 21 9 2018 5 -1 3600))
+                  ;; Month, day, or both, can be a single digit.
+                  ("Friday, 2018-9-08" (nil nil nil 8 9 2018 5 -1 nil))
+                  ("Friday, 2018-09-8" (nil nil nil 8 9 2018 5 -1 nil))
+                  ("Friday, 2018-9-8" (nil nil nil 8 9 2018 5 -1 nil))))
+    (let ((string (car case))
+          (expected (cadr case)))
+      ;; Test with hyphens.
+      (should (equal (parse-date string) expected))
+      (while (string-match "-" string)
+        (setq string (replace-match "/" t t string)))
+      ;; Test with slashes.
+      (should (equal (parse-date string) expected))))
+
+  ;; Time by itself is recognized as such.
+  (should (equal (parse-date "03:47:58")
+                 '(58 47 3 nil nil nil nil -1 nil)))
+  ;; A leading zero for hours is optional.
+  (should (equal (parse-date "3:47:58")
+                 '(58 47 3 nil nil nil nil -1 nil)))
+  ;; Missing seconds are assumed to be zero.
+  (should (equal (parse-date "3:47")
+                 '(0 47 3 nil nil nil nil -1 nil)))
+  ;; AM/PM are understood (in any case combination).
+  (dolist (am '(am AM Am))
+    (should (equal (parse-date (format "3:47 %s" am))
+                   '(0 47 3 nil nil nil nil -1 nil))))
+  (dolist (pm '(pm PM Pm))
+    (should (equal (parse-date (format "3:47 %s" pm))
+                   '(0 47 15 nil nil nil nil -1 nil))))
+
+  ;; Ensure some cases fail.
+  (should-error (parse-date "22 Feb 196" 'us-date))    ;; bad year
+  (should-error (parse-date "22 Feb 16 19:35:42"))     ;; two-digit year
+  (should-error (parse-date "22 Feb 96 19:35:42"))     ;; two-digit year
+  (should-error (parse-date "2 Feb 2021 1996"))        ;; duplicate year
+  (should-error (parse-date "2020-1-1 2021"))  ;; another duplicate year
+  (should-error (parse-date "2020-1-1 30"))    ;; extra 30 (not a day))
+  (should-error (parse-date "2020-1-1 12"))    ;; extra 12 (not a month)
+  (should-error (parse-date "15:47 15:15"))    ;; duplicate time
+  (should-error (parse-date "2020-1-1 +0800 -0800"))   ;; duplicate TZ
+  (should-error (parse-date "15:47 PM"))       ;; PM in the afternoon
+  (should-error (parse-date "2020-1-1 PM"))    ;; PM without a time
+  ;; Range tests.
+  (should-error (parse-date "2021-12-32"))
+  (should-error (parse-date "2021-12-0"))
+  (should-error (parse-date "2021-13-3"))
+  (should-error (parse-date "0000-12-3"))
+  (should-error (parse-date "20021 Dec 3"))
+  (should-error (parse-date "24:21:14"))
+  (should-error (parse-date "14:60:21"))
+  (should-error (parse-date "14:21:60"))
+
+  ;; Test ISO-8601 dates.
+  (dolist (format '(nil iso8601 iso-8601))
+    (should (equal (format-time-string
+                    "%Y-%m-%d %H:%M:%S"
+                    (parse-date "1998-09-12T12:21:54-0200" format) t)
+                   "1998-09-12 14:21:54"))
+    (should (equal (format-time-string
+                    "%Y-%m-%d %H:%M:%S"
+                    (parse-date "1998-09-12T12:21:54-0230" format) t)
+                   "1998-09-12 14:51:54"))
+    (should (equal (format-time-string
+                    "%Y-%m-%d %H:%M:%S"
+                    (parse-date "1998-09-12T12:21:54-02:00" format) t)
+                   "1998-09-12 14:21:54"))
+    (should (equal (format-time-string
+                    "%Y-%m-%d %H:%M:%S"
+                    (parse-date "1998-09-12T12:21:54-02" format) t)
+                   "1998-09-12 14:21:54"))
+    (should (equal (format-time-string
+                    "%Y-%m-%d %H:%M:%S"
+                    (parse-date "1998-09-12T12:21:54+0230" format) t)
+                   "1998-09-12 09:51:54"))
+    (should (equal (format-time-string
+                    "%Y-%m-%d %H:%M:%S"
+                    (parse-date "1998-09-12T12:21:54+02" format) t)
+                   "1998-09-12 10:21:54"))
+    (should (equal (format-time-string
+                    "%Y-%m-%d %H:%M:%S"
+                    (parse-date "1998-09-12T12:21:54Z" format) t)
+                   "1998-09-12 12:21:54"))
+    (should (equal (parse-date "1998-09-12T12:21:54")
+                   (encode-time 54 21 12 12 9 1998)))))
+
+(provide 'parse-date-tests)
+
+;;; parse-date-tests.el ends here

reply via email to

[Prev in Thread] Current Thread [Next in Thread]