bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

range expressions in regexps in non-C locale


From: Bruno Haible
Subject: range expressions in regexps in non-C locale
Date: Sun, 19 Feb 2012 15:08:03 +0100
User-agent: KMail/4.7.4 (Linux/3.1.0-1.2-desktop; KDE/4.7.4; x86_64; ; )

Hi,

In basic regular expressions, range expressions are not safe to use outside
the C locale; the results vary between implementations and locales.

- For 'grep' this was explained in
  https://lists.gnu.org/archive/html/bug-grep/2011-06/msg00031.html
  https://lists.gnu.org/archive/html/bug-grep/2012-01/msg00088.html
  http://savannah.gnu.org/bugs/?32337

- For 'tr' I'm seeing this on Solaris 11 2011-11 in the de_DE.UTF-8 locale:
$ echo abcdefghijklmnopqrstuvwxyz | /usr/bin/tr 'a-z' 'A-Z'
AbcdefghijklmnopqrstuvwxyZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg4/bin/tr 'a-z' 'A-Z'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg4/bin/tr '[a-z]' '[A-Z]'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg6/bin/tr 'a-z' 'A-Z'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg6/bin/tr '[a-z]' '[A-Z]'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
Whereas in the C locale:
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/bin/tr 'a-z' 'A-Z'
AbcdefghijklmnopqrstuvwxyZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg4/bin/tr 'a-z' 'A-Z'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg4/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg6/bin/tr 'a-z' 'A-Z'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg6/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ

Gnulib uses this idiom in these files:

  tests/test-pipe-filter-gi1.c:29:/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
  tests/test-pipe-filter-gi1.c:100:    argv[2] = "[A-Z]";
  tests/test-pipe-filter-ii1.c:29:/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
  tests/test-pipe-filter-ii1.c:123:    argv[2] = "[A-Z]";

Fixed through the patch below.

  build-aux/bootstrap:426:    appvar=`echo $app | tr '[a-z]-' '[A-Z]_'`
  m4/host-os.m4:69:           expr "X$host_os" : 'X\([A-Za-z]\)' | tr '[a-z]' '[A-Z]'
  m4/fnmatch.m4:24:    echo $gl_fnmatch_required | tr '[[A-Z]]' '[[a-z]]'

Although in these situations LC_ALL is already set to C, it's useful for
consistency between these scripts and other scripts to systematically set
LC_ALL=C before such 'tr' commands.

  top/maint.mk:347:         | grep -E '"[A-Z]'                                          \

Is this code meant to catch non-ASCII uppercase error messages as well?


2012-02-19  Bruno Haible  <address@hidden>

        Fix test failure in many locales on Solaris 11.
        * tests/test-pipe-filter-gi1.c (main): Don't use range expression in
        'tr' arguments.
        * tests/test-pipe-filter-ii1.c (main): Likewise.
        * build-aux/bootstrap (check_versions): Run 'tr' command with range
        expressions in the C locale.
        * m4/fnmatch.m4 (gl_FUNC_FNMATCH_POSIX): Likewise.
        * m4/host-os.m4 (gl_HOST_OS): Likewise.

--- build-aux/bootstrap.orig    Sun Feb 19 15:02:49 2012
+++ build-aux/bootstrap Sun Feb 19 14:53:08 2012
@@ -423,7 +423,7 @@
       $use_git || continue
     fi
     # Honor $APP variables ($TAR, $AUTOCONF, etc.)
-    appvar=`echo $app | tr '[a-z]-' '[A-Z]_'`
+    appvar=`echo $app | LC_ALL=C tr '[a-z]-' '[A-Z]_'`
     test "$appvar" = TAR && appvar=AMTAR
     case $appvar in
         GZIP) ;; # Do not use $GZIP:  it contains gzip options.
--- m4/fnmatch.m4.orig  Sun Feb 19 15:02:49 2012
+++ m4/fnmatch.m4       Sun Feb 19 14:53:33 2012
@@ -1,4 +1,4 @@
-# Check for fnmatch - serial 8.
+# Check for fnmatch - serial 9.
 
 # Copyright (C) 2000-2007, 2009-2012 Free Software Foundation, Inc.
 # This file is free software; the Free Software Foundation
@@ -21,7 +21,7 @@
 
   FNMATCH_H=
   gl_fnmatch_required_lowercase=`
-    echo $gl_fnmatch_required | tr '[[A-Z]]' '[[a-z]]'
+    echo $gl_fnmatch_required | LC_ALL=C tr '[[A-Z]]' '[[a-z]]'
   `
   gl_fnmatch_cache_var="gl_cv_func_fnmatch_${gl_fnmatch_required_lowercase}"
   AC_CACHE_CHECK([for working $gl_fnmatch_required fnmatch],
--- m4/host-os.m4.orig  Sun Feb 19 15:02:49 2012
+++ m4/host-os.m4       Sun Feb 19 14:54:02 2012
@@ -1,4 +1,4 @@
-# serial 8
+# serial 9
 
 # Copyright (C) 2001, 2003-2004, 2006, 2009-2012 Free Software Foundation, Inc.
 # This file is free software; the Free Software Foundation
@@ -66,7 +66,7 @@
        # from $host_os, but capitalizes its first letter.
        [A-Za-z]*)
          os=`
-           expr "X$host_os" : 'X\([A-Za-z]\)' | tr '[a-z]' '[A-Z]'
+           expr "X$host_os" : 'X\([A-Za-z]\)' | LC_ALL=C tr '[a-z]' '[A-Z]'
          ``
            expr "X$host_os" : 'X.\([A-Za-z]*\)'
          `
--- tests/test-pipe-filter-gi1.c.orig   Sun Feb 19 15:02:49 2012
+++ tests/test-pipe-filter-gi1.c        Sun Feb 19 15:02:25 2012
@@ -26,8 +26,9 @@
 #include "macros.h"
 
 
-/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
-   characters from lower case to upper case.  */
+/* Pipe a text file through 'LC_ALL=C tr "[a-z]" "[A-Z]"', or equivalently,
+   'tr "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"', which
+   converts ASCII characters from lower case to upper case.  */
 
 struct locals
 {
@@ -96,8 +97,8 @@
     l.nread = 0;
 
     argv[0] = tr_program;
-    argv[1] = "[a-z]";
-    argv[2] = "[A-Z]";
+    argv[1] = "abcdefghijklmnopqrstuvwxyz";
+    argv[2] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
     argv[3] = NULL;
 
     f = pipe_filter_gi_create ("tr", tr_program, argv, false, true,
--- tests/test-pipe-filter-ii1.c.orig   Sun Feb 19 15:02:49 2012
+++ tests/test-pipe-filter-ii1.c        Sun Feb 19 15:02:33 2012
@@ -26,8 +26,9 @@
 #include "macros.h"
 
 
-/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
-   characters from lower case to upper case.  */
+/* Pipe a text file through 'LC_ALL=C tr "[a-z]" "[A-Z]"', or equivalently,
+   'tr "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"', which
+   converts ASCII characters from lower case to upper case.  */
 
 struct locals
 {
@@ -119,8 +120,8 @@
     l.nread = 0;
 
     argv[0] = tr_program;
-    argv[1] = "[a-z]";
-    argv[2] = "[A-Z]";
+    argv[1] = "abcdefghijklmnopqrstuvwxyz";
+    argv[2] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
     argv[3] = NULL;
 
     result = pipe_filter_ii_execute ("tr", tr_program, argv, false, true,




reply via email to

[Prev in Thread] Current Thread [Next in Thread]