From bd6d66e502786df21d2dcaa7b473ee851f840aaa Mon Sep 17 00:00:00 2001
From: Jim Meyering
Date: Sun, 27 Nov 2016 15:36:51 -0800
Subject: [PATCH] dfa: avoid false match in non-UTF8 multibyte locales

* lib/dfa.c (dfa_supported): Treat any non-UTF8 multibyte locale
as "not supported" so that callers will resort to using regex-based
matcher. This will surely hurt performance, but correctness trumps
performance here, and the affected locales are less and less relevant,
these days. See grep's bug report https://bugs.gnu.org/24975.
---
 ChangeLog | 9 +++++++++
 lib/dfa.c | 6 ++++++
 2 files changed, 15 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 0db3da8..fec4fb9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2016-11-27 Jim Meyering
+
+	dfa: avoid false match in non-UTF8 multibyte locales
+	* lib/dfa.c (dfa_supported): Treat any non-UTF8 multibyte locale
+	as "not supported" so that callers will resort to using regex-based
+	matcher. This will surely hurt performance, but correctness trumps
+	performance here, and the affected locales are less and less relevant,
+	these days. See grep's bug report https://bugs.gnu.org/24975.
+
 2016-11-27 Mike Frysinger
 
 	ptsname_r: leverage AC_HEADER_MAJOR to provide major()
diff --git a/lib/dfa.c b/lib/dfa.c
index 5578232..f0ed139 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -3272,6 +3272,12 @@ free_mbdata (struct dfa *d)
 static bool _GL_ATTRIBUTE_PURE
 dfa_supported (struct dfa const *d)
 {
+  /* Declare any non-UTF8 multibyte locale "not supported." Otherwise, a
+     regexp like ".*7" would mistakenly match \uC9, e.g., via this command:
+     (export LC_ALL=zh_CN.gb18030; printf '\uC9\n' | grep '.*7') */
+  if (d->localeinfo.multibyte && !d->localeinfo.using_utf8)
+    return false;
+
   size_t i;
   for (i = 0; i < d->tindex; i++)
     {
-- 
2.9.3