From d8047ae86d5418782db7ec906c10e1af4f129997 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sat, 12 Feb 2022 20:14:27 -0800 Subject: [PATCH] sort: fix several version-sort problems This also affects ls -v in some corner cases. Problems reported by Michael Debertol . While looking into this, I spotted some more areas where the code and documentation did not agree, or where the documentation was unclear. In some cases I changed the code; in others the documentation. I hope things are nailed down better now. * doc/sort-version.texi: Distinguish more carefully between characters and bytes. Say that non-identical strings can compare equal, since they now can. Improve readability in various ways. Make it clearer that a suffix can be the entire string. * src/ls.c (cmp_version): Fall back on strcmp if filevercmp reports equality, since filevercmp is no longer a total order. * src/sort.c (keycompare): Use filenvercmp, to treat NULs correctly. * tests/misc/ls-misc.pl (v_files): Adjust test to match new behavior. * tests/misc/sort-version.sh: Add tests for stability, and for sorting with NUL bytes. --- NEWS | 6 ++ doc/sort-version.texi | 174 ++++++++++++++++++++----------------- src/ls.c | 16 ++-- src/sort.c | 2 +- tests/misc/ls-misc.pl | 8 +- tests/misc/sort-version.sh | 10 ++- 6 files changed, 123 insertions(+), 93 deletions(-) diff --git a/NEWS b/NEWS index fcf31fe39..d58be8863 100644 --- a/NEWS +++ b/NEWS @@ -24,6 +24,12 @@ GNU coreutils NEWS -*- outline -*- 'id xyz' now uses the name 'xyz' to determine groups, instead of xyz's uid. [bug introduced in coreutils-8.22] + 'ls -v' and 'sort -V' no longer mishandle corner cases like "a..a" vs "a.+" + or lines containing NULs. Their behavior now matches the documentation + for file names like ".m4" that consist entirely of an extension, + and the documentation has been clarified for unusual cases. + [bug introduced in coreutils-7.0] + On macOS, 'mv A B' no longer fails with "Operation not supported" when A and B are in the same tmpfs file system. [bug introduced in coreutils-9.0] diff --git a/doc/sort-version.texi b/doc/sort-version.texi index 7f76ac5bb..aa7b0f9d5 100644 --- a/doc/sort-version.texi +++ b/doc/sort-version.texi @@ -161,33 +161,40 @@ The strings are compared from left to right. @item First the initial part of each string consisting entirely of non-digit -characters is determined. +bytes is determined. -@enumerate +@enumerate A @item -These two parts (one of which may be empty) are compared lexically. +These two parts (either of which may be empty) are compared lexically. If a difference is found it is returned. @item -The lexical comparison is a comparison of ASCII values modified so that: +The lexical comparison is a lexicographic comparison of byte strings, +except that: -@enumerate +@enumerate a @item -Letters sort before non-letters. +ASCII letters sort before other bytes. @item -A tilde sorts before anything, even the end of a part. +A tilde sorts before anything, even an empty string. @end enumerate @end enumerate @item -Then the initial part of the remainder of each string which consists -entirely of digit characters is determined. The numerical values of +Then the initial part of the remainder of each string that contains +all the leading digits is determined. The numerical values represented by these two parts are compared, and any difference found is returned as the result of the comparison. -@enumerate + +@enumerate A @item For these purposes an empty string (which can only occur at the end of one or both version strings being compared) counts as zero. + +@item +Because the numerical value is used, non-identical strings can compare +equal. For example, @samp{123} compares equal to @samp{00123}, and +the empty string compares equal to @samp{0}. @end enumerate @item @@ -202,7 +209,7 @@ down to the following parts, and the parts compared respectively from each string: @example -foo @r{vs} foo @r{(rule 2, non-digit characters)} +foo @r{vs} foo @r{(rule 2, non-digits)} 07 @r{vs} 7 @r{(rule 3, digits)} . @r{vs} a. @r{(rule 2)} 7 @r{vs} 7 @r{(rule 3)} @@ -213,22 +220,22 @@ Comparison flow based on above algorithm: @enumerate @item -The first parts (@samp{foo}) are identical in both strings. +The first parts (@samp{foo}) are identical. @item The second parts (@samp{07} and @samp{7}) are compared numerically, -and are identical. +and compare equal. @item The third parts (@samp{.} vs @samp{a.}) are compared -lexically by ASCII value (rule 2.2). +lexically by ASCII value (rule 2.B). @item -The first character of the first string (@samp{.}) is compared -to the first character of the second string (@samp{a}). +The first byte of the first string (@samp{.}) is compared +to the first byte of the second string (@samp{a}). @item -Rule 2.2.1 dictates that ``all letters sorts earlier than all non-letters''. +Rule 2.B.a says letters sorts before non-letters. Hence, @samp{a} comes before @samp{.}. @item @@ -280,17 +287,17 @@ $ sort -n input4 $ sort -V input4 Numeric sort (@samp{sort -n}) treats the entire string as a single numeric value, and compares it to other values. For example, @samp{8.1}, @samp{8.10} and @samp{8.100} are numerically equivalent, and are ordered together. Similarly, -@samp{8.49} is numerically smaller than @samp{8.5}, and appears before first. +@samp{8.49} is numerically less than @samp{8.5}, and appears before first. Version sort (@samp{sort -V}) first breaks down the string into digit and non-digit parts, and only then compares each part (see annotated -example in Version-sort ordering rules). +example in @ref{Version-sort ordering rules}). Comparing the string @samp{8.1} to @samp{8.01}, first the -@samp{8} characters are compared (and are identical), then the +@samp{8}s are compared (and are identical), then the dots (@samp{.}) are compared and are identical, and lastly the remaining digits are compared numerically (@samp{1} and @samp{01}) - -which are numerically equivalent. Hence, @samp{8.01} and @samp{8.1} +which are numerically equal. Hence, @samp{8.01} and @samp{8.1} are grouped together. Similarly, comparing @samp{8.5} to @samp{8.49} -- the @samp{8} @@ -301,10 +308,10 @@ This sorting order (where @samp{8.5} comes before @samp{8.49}) is common when assigning versions to computer programs (while perhaps not intuitive or ``natural'' for people). -@node Punctuation characters -@subsection Punctuation characters +@node Version sort punctuation +@subsection Version sort punctuation -Punctuation characters are sorted by ASCII order (rule 2.2). +Punctuation is sorted by ASCII order (rule 2.B). @example $ touch 1.0.5_src.tar.gz 1.0_src.tar.gz @@ -319,22 +326,22 @@ Based on the version-sort ordering rules, the strings are broken down into the following parts: @example - 1 @r{vs} 1 @r{(rule 3, all digit characters)} - . @r{vs} . @r{(rule 2, all non-digit characters)} + 1 @r{vs} 1 @r{(rule 3, all digits)} + . @r{vs} . @r{(rule 2, all non-digits)} 0 @r{vs} 0 @r{(rule 3)} . @r{vs} _src.tar.gz @r{(rule 2)} - 5 @r{vs} empty string @r{(no more character in the file name)} + 5 @r{vs} empty string @r{(no more bytes in the file name)} _src.tar.gz @r{vs} empty string @end example The fourth parts (@samp{.} and @samp{_src.tar.gz}) are compared -lexically by ASCII order. The character @samp{.} (ASCII value 46) is -smaller than @samp{_} (ASCII value 95) -- and should be listed before it. +lexically by ASCII order. The @samp{.} (ASCII value 46) is +less than @samp{_} (ASCII value 95) -- and should be listed before it. Hence, @file{1.0.5_src.tar.gz} is listed first. -If a different character appears instead of the underscore (for -example, percent sign @samp{%} ASCII value 37, which is smaller +If a different byte appears instead of the underscore (for +example, percent sign @samp{%} ASCII value 37, which is less than dot's ASCII value of 46), that file will be listed first: @example @@ -344,7 +351,7 @@ $ touch 1.0.5_src.tar.gz 1.0%zzzzz.gz @end example The same reasoning applies to the following example, as @samp{.} with -ASCII value 46 is smaller than @samp{/} with ASCII value 47: +ASCII value 46 is less than @samp{/} with ASCII value 47: @example $ cat input5 @@ -359,7 +366,7 @@ $ sort -V input5 @node Punctuation vs letters @subsection Punctuation vs letters -Rule 2.2.1 dictates that letters sorts earlier than all non-letters +Rule 2.B.a says letters sort before non-letters (after breaking down a string to digit and non-digit parts). @example @@ -372,23 +379,23 @@ a% @end example The input strings consist entirely of non-digits, and based on the -above algorithm have only one part, all non-digit characters +above algorithm have only one part, all non-digits (@samp{a%} vs @samp{az}). Each part is then compared lexically, -character-by-character. @samp{a} compares identically in both +byte-by-byte; @samp{a} compares identically in both strings. -Rule 2.2.1 dictates that letters (@samp{z}) sorts earlier than all -non-letters (@samp{%}) -- hence @samp{az} appears first (despite +Rule 2.B.a says a letter like @samp{z} sorts before +a non-letter like @samp{%} -- hence @samp{az} appears first (despite @samp{z} having ASCII value of 122, much larger than @samp{%} with ASCII value 37). -@node The tilde @samp{~} character -@subsection The tilde @samp{~} character +@node The tilde @samp{~} +@subsection The tilde @samp{~} -Rule 2.2.2 dictates that the tilde character @samp{~} (ASCII 126) sorts -before all other non-digit characters, including an empty part. +Rule 2.B.b says the tilde @samp{~} (ASCII 126) sorts +before other bytes, and before an empty string. @example $ cat input7 @@ -413,13 +420,13 @@ with a non-digit (@samp{~}). This is the first part. All other lines in the input file start with a digit -- their first non-digit part is empty. -Based on rule 2.2.2, tilde @samp{~} sorts before all other non-digits -including the empty part -- hence it comes before all other strings, +Based on rule 2.B.b, tilde @samp{~} sorts before other bytes +and before the empty string -- hence it comes before all other strings, and is listed first in the sorted output. The remaining lines (@samp{1}, @samp{1%}, @samp{1.2}, @samp{1~}) follow similar logic: The digit part is extracted (1 for all strings) -and compares identical. The following extracted parts for the remaining +and compares equal. The following extracted parts for the remaining input lines are: empty part, @samp{%}, @samp{.}, @samp{~}. Tilde sorts before all others, hence the line @samp{1~} appears next. @@ -452,8 +459,8 @@ aα Ignoring the first letter (@samp{a}) which is identical in all strings, the compared values are: -@samp{a} and @samp{z} are letters, and sort earlier than -all other non-digit characters. +@samp{a} and @samp{z} are letters, and sort before +all other non-digits. Then, percent sign @samp{%} (ASCII value 37) is compared to the first byte of the UTF-8 sequence of @samp{α}, which is 0xCE or 206). The @@ -467,8 +474,8 @@ official Debian algorithm, in order to accommodate more general usage and file name listing. -@node Hyphen-minus and colon characters -@subsection Hyphen-minus @samp{-} and colon @samp{:} characters +@node Hyphen-minus and colon +@subsection Hyphen-minus @samp{-} and colon @samp{:} In Debian's version string syntax the version consists of three parts: @example @@ -488,7 +495,7 @@ Example of such version strings: @end example If the @samp{debian_revision part} is not present, -hyphen characters @samp{-} are not allowed. +hyphens @samp{-} are not allowed. If epoch is not present, colons @samp{:} are not allowed. If these parts are present, hyphen and/or colons can appear only once @@ -498,8 +505,8 @@ In GNU Coreutils, such restrictions are not reasonable (a file name can have many hyphens, a line of text can have many colons). As a result, in GNU Coreutils hyphens and colons are treated exactly -like all other punctuation characters, i.e., they are sorted after -letters. @xref{Punctuation characters}. +like all other punctuation, i.e., they are sorted after +letters. @xref{Version sort punctuation}. In Debian, these characters are treated differently than in Coreutils: a version string with hyphen will sort before similar strings without @@ -522,29 +529,27 @@ out of order For further details, see @ref{Comparing two strings using Debian's algorithm} and @uref{https://bugs.gnu.org/35939,GNU Bug 35939}. -@node Additional hard-coded priorities in GNU Coreutils version sort -@subsection Additional hard-coded priorities in GNU Coreutils version sort +@node Special priority in GNU Coreutils version sort +@subsection Special priority in GNU Coreutils version sort In GNU Coreutils version sort, the following items have -special priority and sort earlier than all other characters (listed in -order); +special priority and sort before all other strings (listed in order): @enumerate @item The empty string -@item The string @samp{.} (a single dot character, ASCII 46) +@item The string @samp{.} (a single dot, ASCII 46) -@item The string @samp{..} (two dot characters) +@item The string @samp{..} (two dots) -@item Strings start with a dot (@samp{.}) sort earlier than -strings starting with any other characters. +@item Strings starting with dot (@samp{.}) sort before +strings starting with any other byte. @end enumerate Example: @example $ printf '%s\n' a "" b "." c ".." ".d20" ".d3" | sort -V - . .. .d3 @@ -567,33 +572,35 @@ program, the ordering rules are the same. @subsection Special handling of file extensions GNU Coreutils version sort implements specialized handling -of file extensions (or strings that look like file names with -extensions). - -This nuanced implementation enables slightly more natural ordering of files. +of strings that look like file names with extensions. +This enables slightly more natural ordering of file +names. -The additional rules are: +The following additional rules apply when comparing two strings where +both begin with non-@samp{.}. They also apply when comparing two +strings where both begin with @samp{.} but neither is @samp{.} or @samp{..}. @enumerate @item -A suffix (i.e., a file extension) is defined as: a dot, followed by a -letter or tilde, followed by one or more letters, digits, or tildes -(possibly repeated more than once), until the end of the string -(technically, matching the regular expression -@code{(\.[A-Za-z~][A-Za-z0-9~]*)*}). +A suffix (i.e., a file extension) is defined as: a dot, followed by an +ASCII letter or tilde, followed by zero or more ASCII letters, digits, +or tildes; all repeated zero or more times, and ending at string end. +This is equivalent to matching the extended regular expression +@code{(\.[A-Za-z~][A-Za-z0-9~]*)*$} in the C locale. @item -If the strings contains suffixes, the suffixes are temporarily -removed, and the strings are compared without them (using the -@ref{Version-sort ordering rules,algorithm,algorithm} above). +The suffixes are temporarily removed, and the strings are compared +without them, using version sort (see @ref{Version-sort ordering +rules}) without special priority (see @ref{Special priority in GNU +Coreutils version sort}). @item -If the suffix-less strings are identical, the suffix is restored and -the entire strings are compared. +If the suffix-less strings do not compare equal, this comparison +result is used and the suffixes are effectively ignored. @item -If the non-suffixed strings differ, the result is returned and the -suffix is effectively ignored. +If the suffix-less strings compare equal, the suffixes are restored +and the entire strings are compared using version sort. @end enumerate Examples for rule 1: @@ -619,6 +626,9 @@ is not included) @item @samp{gcc-c++-10.8.12-0.7rc2.fc9.tar.bz2}: the suffix is @samp{.fc9.tar.bz2} (@samp{.7rc2} is not included as it begins with a digit) + +@item +@samp{.autom4te.cfg}: the suffix is the entire string. @end itemize Examples for rule 2: @@ -665,8 +675,8 @@ Without the suffix-removal algorithm, the strings will be broken down to the following parts: @example -hello- @r{vs} hello- @r{(rule 2, all non-digit characters)} -8 @r{vs} 8 @r{(rule 3, all digit characters)} +hello- @r{vs} hello- @r{(rule 2, all non-digits)} +8 @r{vs} 8 @r{(rule 3, all digits)} .txt @r{vs} . @r{(rule 2)} empty @r{vs} 2 empty @r{vs} .txt @@ -685,8 +695,8 @@ The suffixes (@samp{.txt}) are removed, and the remaining strings are broken down into the following parts: @example -hello- @r{vs} hello- @r{(rule 2, all non-digit characters)} -8 @r{vs} 8 @r{(rule 3, all digit characters)} +hello- @r{vs} hello- @r{(rule 2, all non-digits)} +8 @r{vs} 8 @r{(rule 3, all digits)} empty @r{vs} . @r{(rule 2)} empty @r{vs} 2 @end example @@ -754,7 +764,7 @@ dpkg: warning: version '3.0/' has bad syntax: To illustrate the different handling of hyphens between Debian and Coreutils algorithms (see -@ref{Hyphen-minus and colon characters}): +@ref{Hyphen-minus and colon}): @example $ compver abb ab-cd 2>/dev/null $ printf 'abb\nab-cd\n' | sort -V diff --git a/src/ls.c b/src/ls.c index 53f80076b..1930e4abb 100644 --- a/src/ls.c +++ b/src/ls.c @@ -3941,18 +3941,20 @@ DEFINE_SORT_FUNCTIONS (extension, cmp_extension) DEFINE_SORT_FUNCTIONS (width, cmp_width) /* Compare file versions. - Unlike all other compare functions above, cmp_version depends only - on filevercmp, which does not fail (even for locale reasons), and does not - need a secondary sort key. See lib/filevercmp.h for function description. + Unlike the other compare functions, cmp_version does not fail + because filevercmp and strcmp do not fail; cmp_version uses strcmp + instead of xstrcoll because filevercmp is locale-independent so + strcmp is its appropriate secondary. - All the other sort options, in fact, need xstrcoll and strcmp variants, - because they all use a string comparison (either as the primary or secondary + All the other sort options need xstrcoll and strcmp variants, + because they all use xstrcoll (either as the primary or secondary sort key), and xstrcoll has the ability to do a longjmp if strcoll fails for - locale reasons. Lastly, filevercmp is ALWAYS available with gnulib. */ + locale reasons. */ static int cmp_version (struct fileinfo const *a, struct fileinfo const *b) { - return filevercmp (a->name, b->name); + int diff = filevercmp (a->name, b->name); + return diff ? diff : strcmp (a->name, b->name); } static int diff --git a/src/sort.c b/src/sort.c index 1a3bda698..3b775d6bb 100644 --- a/src/sort.c +++ b/src/sort.c @@ -2703,7 +2703,7 @@ keycompare (struct line const *a, struct line const *b) else if (key->random) diff = compare_random (ta, tlena, tb, tlenb); else if (key->version) - diff = filevercmp (ta, tb); + diff = filenvercmp (ta, tlena, tb, tlenb); else { /* Locale-dependent string sorting. This is slower than diff --git a/tests/misc/ls-misc.pl b/tests/misc/ls-misc.pl index e246b5c20..a2ac60b23 100755 --- a/tests/misc/ls-misc.pl +++ b/tests/misc/ls-misc.pl @@ -113,8 +113,12 @@ sub make_j_d () chmod 0555, 'j/d' or die "making j/d executable: $!\n"; } -my @v1 = (qw(0 9 A Z a z), 'zz~', 'zz', 'zz.~1~', 'zz.0'); -my @v_files = ((map { ".$_" } @v1), @v1); +my @v_files = ( + '.A', '.Z', '.a', '.z', '.zz~', '.zz', '.zz.~1~', + '.0', '.9', '.zz.0', + '0', '9', + 'A', 'Z', 'a', 'z', 'zz~', 'zz', 'zz.~1~', + 'zz.0'); my $exe_in_subdir = {PRE => sub { make_j_d (); push_ls_colors('ex=01;32') }}; my $remove_j = {POST => sub {unlink 'j/d' or die "j/d: $!\n"; rmdir 'j' or die "j: $!\n"; diff --git a/tests/misc/sort-version.sh b/tests/misc/sort-version.sh index 3786605d6..10ea6a56c 100755 --- a/tests/misc/sort-version.sh +++ b/tests/misc/sort-version.sh @@ -58,6 +58,7 @@ string start 5.8.0 end of str string start 5.80.0 end of str string start 5.9.0 end of str string start 5.90.0 end of str +string start 5.04.0 end of str _EOF_ cat > exp << _EOF_ @@ -85,6 +86,7 @@ string start 5.1.0 end of str string start 5.2.0 end of str string start 5.3.0 end of str string start 5.4.0 end of str +string start 5.04.0 end of str string start 5.5.0 end of str string start 5.6.0 end of str string start 5.7.0 end of str @@ -101,6 +103,12 @@ string start 5.80.0 end of str string start 5.90.0 end of str _EOF_ -sort --sort=version -o out in || fail=1 +sort --stable --sort=version -o out in || fail=1 compare exp out || fail=1 + +tr ' ' '\0' in0 || framework_failure_ +sort --stable --sort=version -o out0 in0 || fail=1 +tr '\0' ' ' out1 || framework_failure_ +compare exp out1 || fail=1 + Exit $fail -- 2.32.0