bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 2/2] regex: fix ignore-case Turkish bug


From: Paul Eggert
Subject: [PATCH 2/2] regex: fix ignore-case Turkish bug
Date: Wed, 23 Sep 2020 17:05:03 -0700

* lib/regex_internal.c (build_wcs_upper_buffer):
Do not assume that converting a single-byte character to upper case
yields a single-byte character.  This is not true for Turkish,
where towupper (L'i') yields L'İ', which is not single-byte.
* tests/test-regex.c (main): Test for this bug.
---
 ChangeLog            |  7 +++++++
 lib/regex_internal.c | 19 ++++++++++---------
 tests/test-regex.c   | 41 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index d15f158ab..5c4d8f849 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2020-09-23  Paul Eggert  <eggert@cs.ucla.edu>
 
+       regex: fix ignore-case Turkish bug
+       * lib/regex_internal.c (build_wcs_upper_buffer):
+       Do not assume that converting single-byte character to upper
+       yields a single-byte character.  This is not true for Turkish,
+       where towupper (L'i') yields L'İ', which is not single-byte.
+       * tests/test-regex.c (main): Test for this bug.
+
        regex: port to weird isascii platforms
        * lib/regex_internal.h (isascii) [!_LIBC]: Supply glibc version.
 
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index e1b6b4d5a..ed0a13461 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -300,18 +300,20 @@ build_wcs_upper_buffer (re_string_t *pstr)
       while (byte_idx < end_idx)
        {
          wchar_t wc;
+         unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 
-         if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
-             && mbsinit (&pstr->cur_state))
+         if (isascii (ch) && mbsinit (&pstr->cur_state))
            {
-             /* In case of a singlebyte character.  */
-             pstr->mbs[byte_idx]
-               = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
              /* The next step uses the assumption that wchar_t is encoded
                 ASCII-safe: all ASCII values can be converted like this.  */
-             pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
-             ++byte_idx;
-             continue;
+             wchar_t wcu = __towupper (ch);
+             if (isascii (wcu))
+               {
+                 pstr->mbs[byte_idx] = wcu;
+                 pstr->wcs[byte_idx] = wcu;
+                 byte_idx++;
+                 continue;
+               }
            }
 
          remain_len = end_idx - byte_idx;
@@ -348,7 +350,6 @@ build_wcs_upper_buffer (re_string_t *pstr)
            {
              /* It is an invalid character, an incomplete character
                 at the end of the string, or '\0'.  Just use the byte.  */
-             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
              pstr->mbs[byte_idx] = ch;
              /* And also cast it to wide char.  */
              pstr->wcs[byte_idx++] = (wchar_t) ch;
diff --git a/tests/test-regex.c b/tests/test-regex.c
index d3f429aeb..b4e23c8c8 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -29,6 +29,15 @@
 
 #include "localcharset.h"
 
+/* Check whether it's really a UTF-8 locale.
+   On mingw, setlocale (LC_ALL, "en_US.UTF-8") succeeds but returns
+   "English_United States.1252", with locale_charset () returning "CP1252".  */
+static int
+really_utf8 (void)
+{
+  return strcmp (locale_charset (), "UTF-8") == 0;
+}
+
 int
 main (void)
 {
@@ -75,11 +84,7 @@ main (void)
           }
       }
 
-      /* Check whether it's really a UTF-8 locale.
-         On mingw, the setlocale call succeeds but returns
-         "English_United States.1252", with locale_charset() returning
-         "CP1252".  */
-      if (strcmp (locale_charset (), "UTF-8") == 0)
+      if (really_utf8 ())
         {
           /* This test is from glibc bug 15078.
              The test case is from Andreas Schwab in
@@ -119,6 +124,32 @@ main (void)
         return 1;
     }
 
+  if (setlocale (LC_ALL, "tr_TR.UTF-8") && really_utf8 ())
+    {
+      re_set_syntax (RE_SYNTAX_GREP | RE_ICASE);
+      if (re_compile_pattern ("i", 1, &regex))
+        result |= 1;
+      else
+        {
+          /* UTF-8 encoding of U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
+             In Turkish, this is the upper-case equivalent of ASCII "i".
+             Older versions of Gnulib failed to match "i" to U+0130 when
+             ignoring case in Turkish <https://bugs.gnu.org/43577>.  */
+          static char const data[] = "\xc4\xb0";
+
+          memset (&regs, 0, sizeof regs);
+          if (re_search (&regex, data, sizeof data - 1, 0, sizeof data - 1,
+                         &regs))
+            result |= 1;
+          regfree (&regex);
+          free (regs.start);
+          free (regs.end);
+
+          if (! setlocale (LC_ALL, "C"))
+            return 1;
+        }
+    }
+
   /* This test is from glibc bug 3957, reported by Andrew Mackey.  */
   re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE);
   memset (&regex, 0, sizeof regex);
-- 
2.25.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]