From d8047ae86d5418782db7ec906c10e1af4f129997 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sat, 12 Feb 2022 20:14:27 -0800
Subject: [PATCH] sort: fix several version-sort problems

This also affects ls -v in some corner cases.
Problems reported by Michael Debertol <https://bugs.gnu.org/49239>.
While looking into this, I spotted some more areas where the
code and documentation did not agree, or where the documentation
was unclear.  In some cases I changed the code; in others
the documentation.  I hope things are nailed down better now.
* doc/sort-version.texi: Distinguish more carefully between
characters and bytes.  Say that non-identical strings can
compare equal, since they now can.  Improve readability in
various ways.  Make it clearer that a suffix can be the
entire string.
* src/ls.c (cmp_version): Fall back on strcmp if filevercmp
reports equality, since filevercmp is no longer a total order.
* src/sort.c (keycompare): Use filenvercmp, to treat NULs correctly.
* tests/misc/ls-misc.pl (v_files):
Adjust test to match new behavior.
* tests/misc/sort-version.sh: Add tests for stability,
and for sorting with NUL bytes.
---
 NEWS                       |   6 ++
 doc/sort-version.texi      | 174 ++++++++++++++++++++-----------------
 src/ls.c                   |  16 ++--
 src/sort.c                 |   2 +-
 tests/misc/ls-misc.pl      |   8 +-
 tests/misc/sort-version.sh |  10 ++-
 6 files changed, 123 insertions(+), 93 deletions(-)

diff --git a/NEWS b/NEWS
index fcf31fe39..d58be8863 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,12 @@ GNU coreutils NEWS                                    -*- outline -*-
   'id xyz' now uses the name 'xyz' to determine groups, instead of xyz's uid.
   [bug introduced in coreutils-8.22]
 
+  'ls -v' and 'sort -V' no longer mishandle corner cases like "a..a" vs "a.+"
+  or lines containing NULs.  Their behavior now matches the documentation
+  for file names like ".m4" that consist entirely of an extension,
+  and the documentation has been clarified for unusual cases.
+  [bug introduced in coreutils-7.0]
+
   On macOS, 'mv A B' no longer fails with "Operation not supported"
   when A and B are in the same tmpfs file system.
   [bug introduced in coreutils-9.0]
diff --git a/doc/sort-version.texi b/doc/sort-version.texi
index 7f76ac5bb..aa7b0f9d5 100644
--- a/doc/sort-version.texi
+++ b/doc/sort-version.texi
@@ -161,33 +161,40 @@ The strings are compared from left to right.
 
 @item
 First the initial part of each string consisting entirely of non-digit
-characters is determined.
+bytes is determined.
 
-@enumerate
+@enumerate A
 @item
-These two parts (one of which may be empty) are compared lexically.
+These two parts (either of which may be empty) are compared lexically.
 If a difference is found it is returned.
 
 @item
-The lexical comparison is a comparison of ASCII values modified so that:
+The lexical comparison is a lexicographic comparison of byte strings,
+except that:
 
-@enumerate
+@enumerate a
 @item
-Letters sort before non-letters.
+ASCII letters sort before other bytes.
 @item
-A tilde sorts before anything, even the end of a part.
+A tilde sorts before anything, even an empty string.
 @end enumerate
 @end enumerate
 
 @item
-Then the initial part of the remainder of each string which consists
-entirely of digit characters is determined. The numerical values of
+Then the initial part of the remainder of each string that contains
+all the leading digits is determined. The numerical values represented by
 these two parts are compared, and any difference found is returned as
 the result of the comparison.
-@enumerate
+
+@enumerate A
 @item
 For these purposes an empty string (which can only occur at the end of
 one or both version strings being compared) counts as zero.
+
+@item
+Because the numerical value is used, non-identical strings can compare
+equal.  For example, @samp{123} compares equal to @samp{00123}, and
+the empty string compares equal to @samp{0}.
 @end enumerate
 
 @item
@@ -202,7 +209,7 @@ down to the following parts, and the parts compared respectively from
 each string:
 
 @example
-foo  @r{vs}  foo   @r{(rule 2, non-digit characters)}
+foo  @r{vs}  foo   @r{(rule 2, non-digits)}
 07   @r{vs}  7     @r{(rule 3, digits)}
 .    @r{vs}  a.    @r{(rule 2)}
 7    @r{vs}  7     @r{(rule 3)}
@@ -213,22 +220,22 @@ Comparison flow based on above algorithm:
 
 @enumerate
 @item
-The first parts (@samp{foo}) are identical in both strings.
+The first parts (@samp{foo}) are identical.
 
 @item
 The second parts (@samp{07} and @samp{7}) are compared numerically,
-and are identical.
+and compare equal.
 
 @item
 The third parts (@samp{.} vs @samp{a.}) are compared
-lexically by ASCII value (rule 2.2).
+lexically by ASCII value (rule 2.B).
 
 @item
-The first character of the first string (@samp{.}) is compared
-to the first character of the second string (@samp{a}).
+The first byte of the first string (@samp{.}) is compared
+to the first byte of the second string (@samp{a}).
 
 @item
-Rule 2.2.1 dictates that ``all letters sorts earlier than all non-letters''.
+Rule 2.B.a says letters sorts before non-letters.
 Hence, @samp{a} comes before @samp{.}.
 
 @item
@@ -280,17 +287,17 @@ $ sort -n input4                  $ sort -V input4
 Numeric sort (@samp{sort -n}) treats the entire string as a single numeric
 value, and compares it to other values. For example, @samp{8.1}, @samp{8.10} and
 @samp{8.100} are numerically equivalent, and are ordered together. Similarly,
-@samp{8.49} is numerically smaller than @samp{8.5}, and appears before first.
+@samp{8.49} is numerically less than @samp{8.5}, and appears before first.
 
 Version sort (@samp{sort -V}) first breaks down the string into digit and
 non-digit parts, and only then compares each part (see annotated
-example in Version-sort ordering rules).
+example in @ref{Version-sort ordering rules}).
 
 Comparing the string @samp{8.1} to @samp{8.01}, first the
-@samp{8} characters are compared (and are identical), then the
+@samp{8}s are compared (and are identical), then the
 dots (@samp{.}) are compared and are identical, and lastly the
 remaining digits are compared numerically (@samp{1} and @samp{01}) -
-which are numerically equivalent. Hence, @samp{8.01} and @samp{8.1}
+which are numerically equal.  Hence, @samp{8.01} and @samp{8.1}
 are grouped together.
 
 Similarly, comparing @samp{8.5} to @samp{8.49} -- the @samp{8}
@@ -301,10 +308,10 @@ This sorting order (where @samp{8.5} comes before @samp{8.49}) is common when
 assigning versions to computer programs (while perhaps not intuitive
 or ``natural'' for people).
 
-@node Punctuation characters
-@subsection Punctuation characters
+@node Version sort punctuation
+@subsection Version sort punctuation
 
-Punctuation characters are sorted by ASCII order (rule 2.2).
+Punctuation is sorted by ASCII order (rule 2.B).
 
 @example
 $ touch 1.0.5_src.tar.gz 1.0_src.tar.gz
@@ -319,22 +326,22 @@ Based on the version-sort ordering rules, the strings are broken down
 into the following parts:
 
 @example
-          1   @r{vs}  1               @r{(rule 3, all digit characters)}
-          .   @r{vs}  .               @r{(rule 2, all non-digit characters)}
+          1   @r{vs}  1               @r{(rule 3, all digits)}
+          .   @r{vs}  .               @r{(rule 2, all non-digits)}
           0   @r{vs}  0               @r{(rule 3)}
           .   @r{vs}  _src.tar.gz     @r{(rule 2)}
-          5   @r{vs}  empty string    @r{(no more character in the file name)}
+          5   @r{vs}  empty string    @r{(no more bytes in the file name)}
 _src.tar.gz   @r{vs}  empty string
 @end example
 
 The fourth parts (@samp{.} and @samp{_src.tar.gz}) are compared
-lexically by ASCII order. The character @samp{.} (ASCII value 46) is
-smaller than @samp{_} (ASCII value 95) -- and should be listed before it.
+lexically by ASCII order. The @samp{.} (ASCII value 46) is
+less than @samp{_} (ASCII value 95) -- and should be listed before it.
 
 Hence, @file{1.0.5_src.tar.gz} is listed first.
 
-If a different character appears instead of the underscore (for
-example, percent sign @samp{%} ASCII value 37, which is smaller
+If a different byte appears instead of the underscore (for
+example, percent sign @samp{%} ASCII value 37, which is less
 than dot's ASCII value of 46), that file will be listed first:
 
 @example
@@ -344,7 +351,7 @@ $ touch   1.0.5_src.tar.gz     1.0%zzzzz.gz
 @end example
 
 The same reasoning applies to the following example, as @samp{.} with
-ASCII value 46 is smaller than @samp{/} with ASCII value 47:
+ASCII value 46 is less than @samp{/} with ASCII value 47:
 
 @example
 $ cat input5
@@ -359,7 +366,7 @@ $ sort -V input5
 @node Punctuation vs letters
 @subsection Punctuation vs letters
 
-Rule 2.2.1 dictates that letters sorts earlier than all non-letters
+Rule 2.B.a says letters sort before non-letters
 (after breaking down a string to digit and non-digit parts).
 
 @example
@@ -372,23 +379,23 @@ a%
 @end example
 
 The input strings consist entirely of non-digits, and based on the
-above algorithm have only one part, all non-digit characters
+above algorithm have only one part, all non-digits
 (@samp{a%} vs @samp{az}).
 
 Each part is then compared lexically,
-character-by-character. @samp{a} compares identically in both
+byte-by-byte; @samp{a} compares identically in both
 strings.
 
-Rule 2.2.1 dictates that letters (@samp{z}) sorts earlier than all
-non-letters (@samp{%}) -- hence @samp{az} appears first (despite
+Rule 2.B.a says a letter like @samp{z} sorts before
+a non-letter like @samp{%} -- hence @samp{az} appears first (despite
 @samp{z} having ASCII value of 122, much larger than @samp{%}
 with ASCII value 37).
 
-@node The tilde @samp{~} character
-@subsection The tilde @samp{~} character
+@node The tilde @samp{~}
+@subsection The tilde @samp{~}
 
-Rule 2.2.2 dictates that the tilde character @samp{~} (ASCII 126) sorts
-before all other non-digit characters, including an empty part.
+Rule 2.B.b says the tilde @samp{~} (ASCII 126) sorts
+before other bytes, and before an empty string.
 
 @example
 $ cat input7
@@ -413,13 +420,13 @@ with a non-digit (@samp{~}). This is the first part. All other lines
 in the input file start with a digit -- their first non-digit part is
 empty.
 
-Based on rule 2.2.2, tilde @samp{~} sorts before all other non-digits
-including the empty part -- hence it comes before all other strings,
+Based on rule 2.B.b, tilde @samp{~} sorts before other bytes
+and before the empty string -- hence it comes before all other strings,
 and is listed first in the sorted output.
 
 The remaining lines (@samp{1}, @samp{1%}, @samp{1.2}, @samp{1~})
 follow similar logic: The digit part is extracted (1 for all strings)
-and compares identical. The following extracted parts for the remaining
+and compares equal. The following extracted parts for the remaining
 input lines are: empty part, @samp{%}, @samp{.}, @samp{~}.
 
 Tilde sorts before all others, hence the line @samp{1~} appears next.
@@ -452,8 +459,8 @@ aα
 Ignoring the first letter (@samp{a}) which is identical in all
 strings, the compared values are:
 
-@samp{a} and @samp{z} are letters, and sort earlier than
-all other non-digit characters.
+@samp{a} and @samp{z} are letters, and sort before
+all other non-digits.
 
 Then, percent sign @samp{%} (ASCII value 37) is compared to the
 first byte of the UTF-8 sequence of @samp{α}, which is 0xCE or 206). The
@@ -467,8 +474,8 @@ official Debian algorithm, in order to accommodate more general usage
 and file name listing.
 
 
-@node Hyphen-minus and colon characters
-@subsection Hyphen-minus @samp{-} and colon @samp{:} characters
+@node Hyphen-minus and colon
+@subsection Hyphen-minus @samp{-} and colon @samp{:}
 
 In Debian's version string syntax the version consists of three parts:
 @example
@@ -488,7 +495,7 @@ Example of such version strings:
 @end example
 
 If the @samp{debian_revision part} is not present,
-hyphen characters @samp{-} are not allowed.
+hyphens @samp{-} are not allowed.
 If epoch is not present, colons @samp{:} are not allowed.
 
 If these parts are present, hyphen and/or colons can appear only once
@@ -498,8 +505,8 @@ In GNU Coreutils, such restrictions are not reasonable (a file name can
 have many hyphens, a line of text can have many colons).
 
 As a result, in GNU Coreutils hyphens and colons are treated exactly
-like all other punctuation characters, i.e., they are sorted after
-letters.  @xref{Punctuation characters}.
+like all other punctuation, i.e., they are sorted after
+letters.  @xref{Version sort punctuation}.
 
 In Debian, these characters are treated differently than in Coreutils:
 a version string with hyphen will sort before similar strings without
@@ -522,29 +529,27 @@ out of order
 For further details, see @ref{Comparing two strings using Debian's
 algorithm} and @uref{https://bugs.gnu.org/35939,GNU Bug 35939}.
 
-@node Additional hard-coded priorities in GNU Coreutils version sort
-@subsection Additional hard-coded priorities in GNU Coreutils version sort
+@node Special priority in GNU Coreutils version sort
+@subsection Special priority in GNU Coreutils version sort
 
 In GNU Coreutils version sort, the following items have
-special priority and sort earlier than all other characters (listed in
-order);
+special priority and sort before all other strings (listed in order):
 
 @enumerate
 @item The empty string
 
-@item The string @samp{.} (a single dot character, ASCII 46)
+@item The string @samp{.} (a single dot, ASCII 46)
 
-@item The string @samp{..} (two dot characters)
+@item The string @samp{..} (two dots)
 
-@item Strings start with a dot (@samp{.}) sort earlier than
-strings starting with any other characters.
+@item Strings starting with dot (@samp{.}) sort before
+strings starting with any other byte.
 @end enumerate
 
 Example:
 
 @example
 $ printf '%s\n' a "" b "." c  ".."  ".d20" ".d3"  | sort -V
-
 .
 ..
 .d3
@@ -567,33 +572,35 @@ program, the ordering rules are the same.
 @subsection Special handling of file extensions
 
 GNU Coreutils version sort implements specialized handling
-of file extensions (or strings that look like file names with
-extensions).
-
-This nuanced implementation enables slightly more natural ordering of files.
+of strings that look like file names with extensions.
+This enables slightly more natural ordering of file
+names.
 
-The additional rules are:
+The following additional rules apply when comparing two strings where
+both begin with non-@samp{.}.  They also apply when comparing two
+strings where both begin with @samp{.} but neither is @samp{.} or @samp{..}.
 
 @enumerate
 @item
-A suffix (i.e., a file extension) is defined as: a dot, followed by a
-letter or tilde, followed by one or more letters, digits, or tildes
-(possibly repeated more than once), until the end of the string
-(technically, matching the regular expression
-@code{(\.[A-Za-z~][A-Za-z0-9~]*)*}).
+A suffix (i.e., a file extension) is defined as: a dot, followed by an
+ASCII letter or tilde, followed by zero or more ASCII letters, digits,
+or tildes; all repeated zero or more times, and ending at string end.
+This is equivalent to matching the extended regular expression
+@code{(\.[A-Za-z~][A-Za-z0-9~]*)*$} in the C locale.
 
 @item
-If the strings contains suffixes, the suffixes are temporarily
-removed, and the strings are compared without them (using the
-@ref{Version-sort ordering rules,algorithm,algorithm} above).
+The suffixes are temporarily removed, and the strings are compared
+without them, using version sort (see @ref{Version-sort ordering
+rules}) without special priority (see @ref{Special priority in GNU
+Coreutils version sort}).
 
 @item
-If the suffix-less strings are identical, the suffix is restored and
-the entire strings are compared.
+If the suffix-less strings do not compare equal, this comparison
+result is used and the suffixes are effectively ignored.
 
 @item
-If the non-suffixed strings differ, the result is returned and the
-suffix is effectively ignored.
+If the suffix-less strings compare equal, the suffixes are restored
+and the entire strings are compared using version sort.
 @end enumerate
 
 Examples for rule 1:
@@ -619,6 +626,9 @@ is not included)
 @item
 @samp{gcc-c++-10.8.12-0.7rc2.fc9.tar.bz2}: the suffix is
 @samp{.fc9.tar.bz2} (@samp{.7rc2} is not included as it begins with a digit)
+
+@item
+@samp{.autom4te.cfg}: the suffix is the entire string.
 @end itemize
 
 Examples for rule 2:
@@ -665,8 +675,8 @@ Without the suffix-removal algorithm, the strings will be broken down
 to the following parts:
 
 @example
-hello-  @r{vs}  hello-  @r{(rule 2, all non-digit characters)}
-8       @r{vs}  8       @r{(rule 3, all digit characters)}
+hello-  @r{vs}  hello-  @r{(rule 2, all non-digits)}
+8       @r{vs}  8       @r{(rule 3, all digits)}
 .txt    @r{vs}  .       @r{(rule 2)}
 empty   @r{vs}  2
 empty   @r{vs}  .txt
@@ -685,8 +695,8 @@ The suffixes (@samp{.txt}) are removed, and the remaining strings are
 broken down into the following parts:
 
 @example
-hello-  @r{vs}  hello-  @r{(rule 2, all non-digit characters)}
-8       @r{vs}  8       @r{(rule 3, all digit characters)}
+hello-  @r{vs}  hello-  @r{(rule 2, all non-digits)}
+8       @r{vs}  8       @r{(rule 3, all digits)}
 empty   @r{vs}  .       @r{(rule 2)}
 empty   @r{vs}  2
 @end example
@@ -754,7 +764,7 @@ dpkg: warning: version '3.0/' has bad syntax:
 
 To illustrate the different handling of hyphens between Debian and
 Coreutils algorithms (see
-@ref{Hyphen-minus and colon characters}):
+@ref{Hyphen-minus and colon}):
 
 @example
 $ compver abb ab-cd 2>/dev/null     $ printf 'abb\nab-cd\n' | sort -V
diff --git a/src/ls.c b/src/ls.c
index 53f80076b..1930e4abb 100644
--- a/src/ls.c
+++ b/src/ls.c
@@ -3941,18 +3941,20 @@ DEFINE_SORT_FUNCTIONS (extension, cmp_extension)
 DEFINE_SORT_FUNCTIONS (width, cmp_width)
 
 /* Compare file versions.
-   Unlike all other compare functions above, cmp_version depends only
-   on filevercmp, which does not fail (even for locale reasons), and does not
-   need a secondary sort key. See lib/filevercmp.h for function description.
+   Unlike the other compare functions, cmp_version does not fail
+   because filevercmp and strcmp do not fail; cmp_version uses strcmp
+   instead of xstrcoll because filevercmp is locale-independent so
+   strcmp is its appropriate secondary.
 
-   All the other sort options, in fact, need xstrcoll and strcmp variants,
-   because they all use a string comparison (either as the primary or secondary
+   All the other sort options need xstrcoll and strcmp variants,
+   because they all use xstrcoll (either as the primary or secondary
    sort key), and xstrcoll has the ability to do a longjmp if strcoll fails for
-   locale reasons.  Lastly, filevercmp is ALWAYS available with gnulib.  */
+   locale reasons.  */
 static int
 cmp_version (struct fileinfo const *a, struct fileinfo const *b)
 {
-  return filevercmp (a->name, b->name);
+  int diff = filevercmp (a->name, b->name);
+  return diff ? diff : strcmp (a->name, b->name);
 }
 
 static int
diff --git a/src/sort.c b/src/sort.c
index 1a3bda698..3b775d6bb 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2703,7 +2703,7 @@ keycompare (struct line const *a, struct line const *b)
           else if (key->random)
             diff = compare_random (ta, tlena, tb, tlenb);
           else if (key->version)
-            diff = filevercmp (ta, tb);
+            diff = filenvercmp (ta, tlena, tb, tlenb);
           else
             {
               /* Locale-dependent string sorting.  This is slower than
diff --git a/tests/misc/ls-misc.pl b/tests/misc/ls-misc.pl
index e246b5c20..a2ac60b23 100755
--- a/tests/misc/ls-misc.pl
+++ b/tests/misc/ls-misc.pl
@@ -113,8 +113,12 @@ sub make_j_d ()
   chmod 0555, 'j/d' or die "making j/d executable: $!\n";
 }
 
-my @v1 = (qw(0 9 A Z a z), 'zz~', 'zz', 'zz.~1~', 'zz.0');
-my @v_files = ((map { ".$_" } @v1), @v1);
+my @v_files = (
+    '.A', '.Z', '.a', '.z', '.zz~', '.zz', '.zz.~1~',
+    '.0', '.9', '.zz.0',
+     '0',  '9',
+     'A',  'Z',  'a',  'z',  'zz~',  'zz',  'zz.~1~',
+     'zz.0');
 my $exe_in_subdir = {PRE => sub { make_j_d (); push_ls_colors('ex=01;32') }};
 my $remove_j = {POST => sub {unlink 'j/d' or die "j/d: $!\n";
                              rmdir 'j' or die "j: $!\n";
diff --git a/tests/misc/sort-version.sh b/tests/misc/sort-version.sh
index 3786605d6..10ea6a56c 100755
--- a/tests/misc/sort-version.sh
+++ b/tests/misc/sort-version.sh
@@ -58,6 +58,7 @@ string start 5.8.0 end of str
 string start 5.80.0 end of str
 string start 5.9.0 end of str
 string start 5.90.0 end of str
+string start 5.04.0 end of str
 _EOF_
 
 cat > exp << _EOF_
@@ -85,6 +86,7 @@ string start 5.1.0 end of str
 string start 5.2.0 end of str
 string start 5.3.0 end of str
 string start 5.4.0 end of str
+string start 5.04.0 end of str
 string start 5.5.0 end of str
 string start 5.6.0 end of str
 string start 5.7.0 end of str
@@ -101,6 +103,12 @@ string start 5.80.0 end of str
 string start 5.90.0 end of str
 _EOF_
 
-sort --sort=version -o out in || fail=1
+sort --stable --sort=version -o out in || fail=1
 compare exp out || fail=1
+
+tr ' ' '\0' <in >in0 || framework_failure_
+sort --stable --sort=version -o out0 in0 || fail=1
+tr '\0' ' ' <out0 >out1 || framework_failure_
+compare exp out1 || fail=1
+
 Exit $fail
-- 
2.32.0