[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
range expressions in regexps in non-C locale
From: |
Bruno Haible |
Subject: |
range expressions in regexps in non-C locale |
Date: |
Sun, 19 Feb 2012 15:08:03 +0100 |
User-agent: |
KMail/4.7.4 (Linux/3.1.0-1.2-desktop; KDE/4.7.4; x86_64; ; ) |
Hi,
In basic regular expressions, range expressions are not safe to use outside
the C locale; the results vary between implementations and locales.
- For 'grep' this was explained in
https://lists.gnu.org/archive/html/bug-grep/2011-06/msg00031.html
https://lists.gnu.org/archive/html/bug-grep/2012-01/msg00088.html
http://savannah.gnu.org/bugs/?32337
- For 'tr' I'm seeing this on Solaris 11 2011-11 in de_DE.UTF-8 locale:
$ echo abcdefghijklmnopqrstuvwxyz | /usr/bin/tr 'a-z' 'A-Z'
AbcdefghijklmnopqrstuvwxyZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg4/bin/tr 'a-z' 'A-Z'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg4/bin/tr '[a-z]' '[A-Z]'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg6/bin/tr 'a-z' 'A-Z'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
$ echo abcdefghijklmnopqrstuvwxyz | /usr/xpg6/bin/tr '[a-z]' '[A-Z]'
ABⓒ𝚍ⓔFGH𝙞JK𝚕ⓜNOPQⓡSTUⓥWⓧYZ
Whereas in C locale:
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/bin/tr 'a-z' 'A-Z'
AbcdefghijklmnopqrstuvwxyZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg4/bin/tr 'a-z' 'A-Z'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg4/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg6/bin/tr 'a-z' 'A-Z'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
$ echo abcdefghijklmnopqrstuvwxyz | LC_ALL=C /usr/xpg6/bin/tr '[a-z]' '[A-Z]'
ABCDEFGHIJKLMNOPQRSTUVWXYZ
Gnulib uses this idiom in these files:
tests/test-pipe-filter-gi1.c:29:/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
tests/test-pipe-filter-gi1.c:100: argv[2] = "[A-Z]";
tests/test-pipe-filter-ii1.c:29:/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
tests/test-pipe-filter-ii1.c:123: argv[2] = "[A-Z]";
Fixed through the patch below.
build-aux/bootstrap:426: appvar=`echo $app | tr '[a-z]-' '[A-Z]_'`
m4/host-os.m4:69: expr "X$host_os" : 'X\([A-Za-z]\)' | tr '[a-z]' '[A-Z]'
m4/fnmatch.m4:24: echo $gl_fnmatch_required | tr '[[A-Z]]' '[[a-z]]'
Although in these situations LC_ALL is already set to C, it's useful for
consistency between these scripts and other scripts to systematically set
LC_ALL=C before such 'tr' commands.
top/maint.mk:347: | grep -E '"[A-Z]' \
Is this code meant to catch non-ASCII uppercase error messages as well?
2012-02-19 Bruno Haible <address@hidden>
Fix test failure in many locales on Solaris 11.
* tests/test-pipe-filter-gi1.c (main): Don't use range expression in
'tr' arguments.
* tests/test-pipe-filter-ii1.c (main): Likewise.
* build-aux/bootstrap (check_versions): Run 'tr' command with range
expressions in the C locale.
* m4/fnmatch.m4 (gl_FUNC_FNMATCH_POSIX): Likewise.
* m4/host-os.m4 (gl_HOST_OS): Likewise.
--- build-aux/bootstrap.orig Sun Feb 19 15:02:49 2012
+++ build-aux/bootstrap Sun Feb 19 14:53:08 2012
@@ -423,7 +423,7 @@
$use_git || continue
fi
# Honor $APP variables ($TAR, $AUTOCONF, etc.)
- appvar=`echo $app | tr '[a-z]-' '[A-Z]_'`
+ appvar=`echo $app | LC_ALL=C tr '[a-z]-' '[A-Z]_'`
test "$appvar" = TAR && appvar=AMTAR
case $appvar in
GZIP) ;; # Do not use $GZIP: it contains gzip options.
--- m4/fnmatch.m4.orig Sun Feb 19 15:02:49 2012
+++ m4/fnmatch.m4 Sun Feb 19 14:53:33 2012
@@ -1,4 +1,4 @@
-# Check for fnmatch - serial 8.
+# Check for fnmatch - serial 9.
# Copyright (C) 2000-2007, 2009-2012 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
@@ -21,7 +21,7 @@
FNMATCH_H=
gl_fnmatch_required_lowercase=`
- echo $gl_fnmatch_required | tr '[[A-Z]]' '[[a-z]]'
+ echo $gl_fnmatch_required | LC_ALL=C tr '[[A-Z]]' '[[a-z]]'
`
gl_fnmatch_cache_var="gl_cv_func_fnmatch_${gl_fnmatch_required_lowercase}"
AC_CACHE_CHECK([for working $gl_fnmatch_required fnmatch],
--- m4/host-os.m4.orig Sun Feb 19 15:02:49 2012
+++ m4/host-os.m4 Sun Feb 19 14:54:02 2012
@@ -1,4 +1,4 @@
-# serial 8
+# serial 9
# Copyright (C) 2001, 2003-2004, 2006, 2009-2012 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
@@ -66,7 +66,7 @@
# from $host_os, but capitalizes its first letter.
[A-Za-z]*)
os=`
- expr "X$host_os" : 'X\([A-Za-z]\)' | tr '[a-z]' '[A-Z]'
+ expr "X$host_os" : 'X\([A-Za-z]\)' | LC_ALL=C tr '[a-z]' '[A-Z]'
``
expr "X$host_os" : 'X.\([A-Za-z]*\)'
`
--- tests/test-pipe-filter-gi1.c.orig Sun Feb 19 15:02:49 2012
+++ tests/test-pipe-filter-gi1.c Sun Feb 19 15:02:25 2012
@@ -26,8 +26,9 @@
#include "macros.h"
-/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
- characters from lower case to upper case. */
+/* Pipe a text file through 'LC_ALL=C tr "[a-z]" "[A-Z]"', or equivalently,
+ 'tr "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"', which
+ converts ASCII characters from lower case to upper case. */
struct locals
{
@@ -96,8 +97,8 @@
l.nread = 0;
argv[0] = tr_program;
- argv[1] = "[a-z]";
- argv[2] = "[A-Z]";
+ argv[1] = "abcdefghijklmnopqrstuvwxyz";
+ argv[2] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
argv[3] = NULL;
f = pipe_filter_gi_create ("tr", tr_program, argv, false, true,
--- tests/test-pipe-filter-ii1.c.orig Sun Feb 19 15:02:49 2012
+++ tests/test-pipe-filter-ii1.c Sun Feb 19 15:02:33 2012
@@ -26,8 +26,9 @@
#include "macros.h"
-/* Pipe a text file through 'tr "[a-z]" "[A-Z]"', which converts ASCII
- characters from lower case to upper case. */
+/* Pipe a text file through 'LC_ALL=C tr "[a-z]" "[A-Z]"', or equivalently,
+ 'tr "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"', which
+ converts ASCII characters from lower case to upper case. */
struct locals
{
@@ -119,8 +120,8 @@
l.nread = 0;
argv[0] = tr_program;
- argv[1] = "[a-z]";
- argv[2] = "[A-Z]";
+ argv[1] = "abcdefghijklmnopqrstuvwxyz";
+ argv[2] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
argv[3] = NULL;
result = pipe_filter_ii_execute ("tr", tr_program, argv, false, true,
- range expressions in regexps in non-C locale,
Bruno Haible <=