Re: [striconveh] Error handling and Unicode replacement character

bug-gnulib
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [striconveh] Error handling and Unicode replacement character

From:	Bruno Haible
Subject:	Re: [striconveh] Error handling and Unicode replacement character
Date:	Sat, 01 Jan 2022 19:55:28 +0100
Marc Nieper-Wißkirchen wrote on 2021-12-30:
> The striconveh module and related modules offer an error handler
> argument. The current possible values are:
> 
> iconveh_error
> iconveh_question_mark
> iconveh_escape_sequence
> 
> The second option replaces any unconvertible character with a question mark "?".
> 
> I would like to request to add a fourth option, say,
> iconveh_replacement_character, which is like iconveh_question_mark but
> uses U+FFFD whenever the target codeset is a Unicode codeset.

That's a good suggestion, as nowadays people are frequently converting
to UTF-8 or GB18030. Implemented as follows.


2022-01-01  Bruno Haible  <bruno@clisp.org>

        striconveh: Support an error handler that produces a Unicode U+FFFD.
        Suggested by Marc Nieper-Wißkirchen in
        <https://lists.gnu.org/archive/html/bug-gnulib/2021-12/msg00175.html>.
        * lib/iconveh.h (iconveh_replacement_character): New enum value.
        * lib/striconveh.c (mem_cd_iconveh_internal): When the handler is
        iconveh_replacement_character, try to produce U+FFFD when possible,
        instead of '?'.
        * tests/test-striconveh.c (main): Add GB18030 tests. Test also
        iconveh_replacement_character.

diff --git a/lib/iconveh.h b/lib/iconveh.h
index d321d34cb..058f68ca2 100644
--- a/lib/iconveh.h
+++ b/lib/iconveh.h
@@ -29,7 +29,10 @@ enum iconv_ilseq_handler
 {
   iconveh_error,                /* return and set errno = EILSEQ */
   iconveh_question_mark,        /* use one '?' per unconvertible character */
-  iconveh_escape_sequence       /* use escape sequence \uxxxx or \Uxxxxxxxx */
+  iconveh_escape_sequence,      /* use escape sequence \uxxxx or \Uxxxxxxxx */
+  iconveh_replacement_character /* use one U+FFFD per unconvertible character
+                                   if that fits in the target encoding,
+                                   otherwise one '?' */
 };
 
 
diff --git a/lib/striconveh.c b/lib/striconveh.c
index 4aa8a2f07..612c38c3e 100644
--- a/lib/striconveh.c
+++ b/lib/striconveh.c
@@ -457,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                 if (cd2 == (iconv_t)(-1))
                   {
                     /* TO_CODESET is UTF-8.  */
-                    /* Error handling can produce up to 1 byte of output.  */
-                    if (length + 1 + extra_alloc > allocated)
+                    /* Error handling can produce up to 1 or 3 bytes of
+                       output.  */
+                    size_t extra_need =
+                      (handler == iconveh_replacement_character ? 3 : 1);
+                    if (length + extra_need + extra_alloc > allocated)
                       {
                         char *memory;
 
                         allocated = 2 * allocated;
-                        if (length + 1 + extra_alloc > allocated)
+                        if (length + extra_need + extra_alloc > allocated)
+                          allocated = 2 * allocated;
+                        if (length + extra_need + extra_alloc > allocated)
                           abort ();
                         if (result == initial_result)
                           memory = (char *) malloc (allocated);
@@ -482,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                         grow = false;
                       }
                     /* The input is invalid in FROM_CODESET.  Eat up one byte
-                       and emit a question mark.  */
+                       and emit a replacement character or a question mark.  */
                     if (!incremented)
                       {
                         if (insize == 0)
@@ -490,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                         inptr++;
                         insize--;
                       }
-                    result[length] = '?';
-                    length++;
+                    if (handler == iconveh_replacement_character)
+                      {
+                        /* U+FFFD in UTF-8 encoding.  */
+                        result[length+0] = '\357';
+                        result[length+1] = '\277';
+                        result[length+2] = '\275';
+                        length += 3;
+                      }
+                    else
+                      {
+                        result[length] = '?';
+                        length++;
+                      }
                   }
                 else
                   goto indirectly;
@@ -594,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
   {
     const bool slowly = (offsets != NULL || handler == iconveh_error);
 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
-    char utf8buf[utf8bufsize + 1];
+    char utf8buf[utf8bufsize + 3];
     size_t utf8len = 0;
     const char *in1ptr = src;
     size_t in1size = srclen;
@@ -682,8 +698,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
             && errno == EILSEQ && handler != iconveh_error)
           {
             /* The input is invalid in FROM_CODESET.  Eat up one byte and
-               emit a question mark.  Room for the question mark was allocated
-               at the end of utf8buf.  */
+               emit a U+FFFD character or a question mark.  Room for this
+               character was allocated at the end of utf8buf.  */
             if (!incremented1)
               {
                 if (in1size == 0)
@@ -691,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                 in1ptr++;
                 in1size--;
               }
-            *out1ptr++ = '?';
+            if (handler == iconveh_replacement_character)
+              {
+                /* U+FFFD in UTF-8 encoding.  */
+                out1ptr[0] = '\357';
+                out1ptr[1] = '\277';
+                out1ptr[2] = '\275';
+                out1ptr += 3;
+              }
+            else
+              *out1ptr++ = '?';
             res1 = 0;
           }
         errno1 = errno;
@@ -756,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                       break;
                     else if (errno == EILSEQ && handler != iconveh_error)
                       {
-                        /* Error handling can produce up to 10 bytes of ASCII
+                        /* Error handling can produce up to 10 bytes of UTF-8
                            output.  But TO_CODESET may be UCS-2, UTF-16 or
                            UCS-4, so use CD2 here as well.  */
                         char scratchbuf[10];
@@ -804,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                             scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
                             scratchbuf[scratchlen++] = hex[uc & 15];
                           }
+                        else if (handler == iconveh_replacement_character)
+                          {
+                            /* U+FFFD in UTF-8 encoding.  */
+                            scratchbuf[0] = '\357';
+                            scratchbuf[1] = '\277';
+                            scratchbuf[2] = '\275';
+                            scratchlen = 3;
+                          }
                         else
                           {
                             scratchbuf[0] = '?';
@@ -813,9 +846,24 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                         inptr = scratchbuf;
                         insize = scratchlen;
                         if (cd2 != (iconv_t)(-1))
-                          res = iconv (cd2,
-                                       (ICONV_CONST char **) &inptr, &insize,
-                                       &out2ptr, &out2size);
+                          {
+                            res = iconv (cd2,
+                                         (ICONV_CONST char **) &inptr, &insize,
+                                         &out2ptr, &out2size);
+                            if (handler == iconveh_replacement_character
+                                && res == (size_t)(-1) && errno == EILSEQ)
+                              {
+                                 /* U+FFFD can't be converted to TO_CODESET.
+                                    Use '?' instead.  */
+                                scratchbuf[0] = '?';
+                                scratchlen = 1;
+                                inptr = scratchbuf;
+                                insize = scratchlen;
+                                res = iconv (cd2,
+                                             (ICONV_CONST char **) &inptr, &insize,
+                                             &out2ptr, &out2size);
+                              }
+                          }
                         else
                           {
                             /* TO_CODESET is UTF-8.  */
diff --git a/tests/test-striconveh.c b/tests/test-striconveh.c
index 438b7b087..781aa5254 100644
--- a/tests/test-striconveh.c
+++ b/tests/test-striconveh.c
@@ -46,14 +46,19 @@ main ()
 {
 #if HAVE_ICONV
   static enum iconv_ilseq_handler handlers[] =
-    { iconveh_error, iconveh_question_mark, iconveh_escape_sequence };
+    {
+      iconveh_error,
+      iconveh_question_mark,
+      iconveh_replacement_character,
+      iconveh_escape_sequence
+    };
   size_t indirect;
   size_t h;
   size_t o;
   size_t i;
 
   /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
-     ISO-8859-2, and UTF-8.  */
+     ISO-8859-2, UTF-8, and with libiconv or glibc also GB18030.  */
   iconv_t cd_ascii_to_88591 = iconv_open ("ISO-8859-1", "ASCII");
   iconv_t cd_88591_to_88592 = iconv_open ("ISO-8859-2", "ISO-8859-1");
   iconv_t cd_88592_to_88591 = iconv_open ("ISO-8859-1", "ISO-8859-2");
@@ -63,6 +68,12 @@ main ()
   iconv_t cd_88592_to_utf8 = iconv_open ("UTF-8", "ISO-8859-2");
   iconv_t cd_utf8_to_88592 = iconv_open ("ISO-8859-2", "UTF-8");
   iconv_t cd_utf7_to_utf8 = iconv_open ("UTF-8", "UTF-7");
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  iconv_t cd_ascii_to_gb18030 = iconv_open ("GB18030", "ASCII");
+  iconv_t cd_utf8_to_gb18030 = iconv_open ("GB18030", "UTF-8");
+  iconv_t cd_88591_to_gb18030 = iconv_open ("GB18030", "ISO-8859-1");
+  iconv_t cd_utf7_to_gb18030 = iconv_open ("GB18030", "UTF-7");
+# endif
   iconveh_t cdeh_ascii_to_88591;
   iconveh_t cdeh_ascii_to_88591_indirectly;
   iconveh_t cdeh_88592_to_88591;
@@ -71,12 +82,21 @@ main ()
   iconveh_t cdeh_88591_to_utf8;
   iconveh_t cdeh_utf8_to_88591;
   iconveh_t cdeh_utf7_to_utf8;
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  iconveh_t cdeh_ascii_to_gb18030;
+  iconveh_t cdeh_88591_to_gb18030;
+  iconveh_t cdeh_utf7_to_gb18030;
+# endif
 
   ASSERT (cd_ascii_to_utf8 != (iconv_t)(-1));
   ASSERT (cd_88591_to_utf8 != (iconv_t)(-1));
   ASSERT (cd_utf8_to_88591 != (iconv_t)(-1));
   ASSERT (cd_88592_to_utf8 != (iconv_t)(-1));
   ASSERT (cd_utf8_to_88592 != (iconv_t)(-1));
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  ASSERT (cd_ascii_to_gb18030 != (iconv_t)(-1));
+  ASSERT (cd_utf8_to_gb18030 != (iconv_t)(-1));
+# endif
 
   cdeh_ascii_to_88591.cd = cd_ascii_to_88591;
   cdeh_ascii_to_88591.cd1 = cd_ascii_to_utf8;
@@ -110,6 +130,20 @@ main ()
   cdeh_utf7_to_utf8.cd1 = cd_utf7_to_utf8;
   cdeh_utf7_to_utf8.cd2 = (iconv_t)(-1);
 
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  cdeh_ascii_to_gb18030.cd = cd_ascii_to_gb18030;
+  cdeh_ascii_to_gb18030.cd1 = cd_ascii_to_utf8;
+  cdeh_ascii_to_gb18030.cd2 = cd_utf8_to_gb18030;
+
+  cdeh_88591_to_gb18030.cd = cd_88591_to_gb18030;
+  cdeh_88591_to_gb18030.cd1 = cd_88591_to_utf8;
+  cdeh_88591_to_gb18030.cd2 = cd_utf8_to_gb18030;
+
+  cdeh_utf7_to_gb18030.cd = cd_utf7_to_gb18030;
+  cdeh_utf7_to_gb18030.cd1 = cd_utf7_to_utf8;
+  cdeh_utf7_to_gb18030.cd2 = cd_utf8_to_gb18030;
+# endif
+
   /* ------------------------ Test mem_cd_iconveh() ------------------------ */
 
   /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors.  */
@@ -175,6 +209,7 @@ main ()
                     free (offsets);
                   break;
                 case iconveh_question_mark:
+                case iconveh_replacement_character:
                 case iconveh_escape_sequence:
                   {
                     static const char expected[] = "Rafa? Maszkowski";
@@ -224,6 +259,7 @@ main ()
                     free (offsets);
                   break;
                 case iconveh_question_mark:
+                case iconveh_replacement_character:
                   {
                     static const char expected[] = "Rafa? Maszkowski";
                     ASSERT (retval == 0);
@@ -294,6 +330,41 @@ main ()
         }
     }
 
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
+  for (h = 0; h < SIZEOF (handlers); h++)
+    {
+      enum iconv_ilseq_handler handler = handlers[h];
+      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+      static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+      for (o = 0; o < 2; o++)
+        {
+          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
+          char *result = NULL;
+          size_t length = 0;
+          int retval = mem_cd_iconveh (input, strlen (input),
+                                       &cdeh_88591_to_gb18030,
+                                       handler,
+                                       offsets,
+                                       &result, &length);
+          ASSERT (retval == 0);
+          ASSERT (length == strlen (expected));
+          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+          if (o)
+            {
+              for (i = 0; i < 37; i++)
+                ASSERT (offsets[i] == (i < 1 ? i :
+                                       i < 12 ? i + 3 :
+                                       i < 18 ? i + 6 :
+                                       i + 7));
+              ASSERT (offsets[37] == MAGIC);
+              free (offsets);
+            }
+          free (result);
+        }
+    }
+# endif
+
   /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
   for (h = 0; h < SIZEOF (handlers); h++)
     {
@@ -371,10 +442,88 @@ main ()
                 free (result);
               }
               break;
+            case iconveh_replacement_character:
+              {
+                static const char expected[] = "Rafa\357\277\275 Maszkowski";
+                ASSERT (retval == 0);
+                ASSERT (length == strlen (expected));
+                ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+                if (o)
+                  {
+                    for (i = 0; i < 16; i++)
+                      ASSERT (offsets[i] == (i < 5 ? i : i + 2));
+                    ASSERT (offsets[16] == MAGIC);
+                    free (offsets);
+                  }
+                free (result);
+              }
+              break;
             }
         }
     }
 
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ).  */
+  for (h = 0; h < SIZEOF (handlers); h++)
+    {
+      enum iconv_ilseq_handler handler = handlers[h];
+      static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski */
+      for (o = 0; o < 2; o++)
+        {
+          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
+          char *result = NULL;
+          size_t length = 0;
+          int retval = mem_cd_iconveh (input, strlen (input),
+                                       &cdeh_ascii_to_gb18030,
+                                       handler,
+                                       offsets,
+                                       &result, &length);
+          switch (handler)
+            {
+            case iconveh_error:
+              ASSERT (retval == -1 && errno == EILSEQ);
+              ASSERT (result == NULL);
+              if (o)
+                free (offsets);
+              break;
+            case iconveh_question_mark:
+            case iconveh_escape_sequence:
+              {
+                static const char expected[] = "Rafa? Maszkowski";
+                ASSERT (retval == 0);
+                ASSERT (length == strlen (expected));
+                ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+                if (o)
+                  {
+                    for (i = 0; i < 16; i++)
+                      ASSERT (offsets[i] == i);
+                    ASSERT (offsets[16] == MAGIC);
+                    free (offsets);
+                  }
+                free (result);
+              }
+              break;
+            case iconveh_replacement_character:
+              {
+                static const char expected[] = "Rafa\2041\2447 Maszkowski";
+                ASSERT (retval == 0);
+                ASSERT (length == strlen (expected));
+                ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+                if (o)
+                  {
+                    for (i = 0; i < 16; i++)
+                      ASSERT (offsets[i] == (i < 5 ? i : i + 3));
+                    ASSERT (offsets[16] == MAGIC);
+                    free (offsets);
+                  }
+                free (result);
+              }
+              break;
+            }
+        }
+    }
+# endif
+
   /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ.  */
   for (h = 0; h < SIZEOF (handlers); h++)
     {
@@ -399,6 +548,7 @@ main ()
                 free (offsets);
               break;
             case iconveh_question_mark:
+            case iconveh_replacement_character:
               {
                 static const char expected[] = "Rafa? Maszkowski";
                 ASSERT (retval == 0);
@@ -496,6 +646,34 @@ main ()
           free (result);
         }
 
+#  if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+      /* Test conversion from UTF-7 to GB18030 with EINVAL.  */
+      for (h = 0; h < SIZEOF (handlers); h++)
+        {
+          enum iconv_ilseq_handler handler = handlers[h];
+          /* This is base64 encoded 0x54 0x32 0xD8 0x3F 0xD8 0x40.  It would
+             convert to U+5432 U+D83F U+D840 but these are Unicode surrogates.  */
+          static const char input[] = "+VDLYP9hA";
+          static const char expected1[] = "\337\305"; /* 吲 glibc */
+          static const char expected2[] = ""; /* libiconv */
+          char *result = NULL;
+          size_t length = 0;
+          int retval = mem_cd_iconveh (input, 7,
+                                       &cdeh_utf7_to_gb18030,
+                                       handler,
+                                       NULL,
+                                       &result, &length);
+          ASSERT (retval == 0);
+          ASSERT (length == strlen (expected1) || length == strlen (expected2));
+          ASSERT (result != NULL);
+          if (length == strlen (expected1))
+            ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+          else
+            ASSERT (memcmp (result, expected2, strlen (expected2)) == 0);
+          free (result);
+        }
+#  endif
+
       /* Disabled on NetBSD, because NetBSD 5.0 iconv() is buggy: it converts
          the input "+2D/YQNhB" to U+1FED8 U+3FD8 U+40D8.  */
 #  if !(defined __NetBSD__ && !defined _LIBICONV_VERSION)
@@ -544,8 +722,98 @@ main ()
                 free (result);
               }
               break;
+            case iconveh_replacement_character:
+              {
+                /* glibc result */
+                static const char expected1[] = "\357\277\275\357\277\275\357\277\275\357\277\275\357\277\275";
+                /* libiconv <= 1.12 result */
+                static const char expected2[] = "\357\277\2752D/YQNhB";
+                /* libiconv >= 1.13 result */
+                static const char expected3[] = "\357\277\275\340\277\266\341\200\266";
+                ASSERT (retval == 0);
+                ASSERT (length == strlen (expected1)
+                        || length == strlen (expected2)
+                        || length == strlen (expected3));
+                ASSERT (result != NULL);
+                if (length == strlen (expected1))
+                  ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+                else if (length == strlen (expected2))
+                  ASSERT (memcmp (result, expected2, strlen (expected2)) == 0);
+                else
+                  ASSERT (memcmp (result, expected3, strlen (expected3)) == 0);
+                free (result);
+              }
+            }
+        }
+
+#   if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+      /* Test conversion from UTF-7 to GB18030 with EILSEQ.  */
+      for (h = 0; h < SIZEOF (handlers); h++)
+        {
+          enum iconv_ilseq_handler handler = handlers[h];
+          /* This is base64 encoded 0xD8 0x3F 0xD8 0x40 0xD8 0x41.  It would
+             convert to U+D83F U+D840 U+D841 but these are Unicode surrogates.  */
+          static const char input[] = "+2D/YQNhB";
+          char *result = NULL;
+          size_t length = 0;
+          int retval = mem_cd_iconveh (input, strlen (input),
+                                       &cdeh_utf7_to_gb18030,
+                                       handler,
+                                       NULL,
+                                       &result, &length);
+          switch (handler)
+            {
+            case iconveh_error:
+              ASSERT (retval == -1 && errno == EILSEQ);
+              ASSERT (result == NULL);
+              break;
+            case iconveh_question_mark:
+            case iconveh_escape_sequence:
+              {
+                /* glibc result */
+                static const char expected1[] = "?????";
+                /* libiconv <= 1.12 result */
+                static const char expected2[] = "?2D/YQNhB";
+                /* libiconv behaviour changed in version 1.13: the result is
+                   '?' U+0FF6 U+1036; this is U+D83F U+D840 U+D841 shifted left
+                   by 6 bits.  */
+                static const char expected3[] = "?\2013\2030\2013\2114";
+                ASSERT (retval == 0);
+                ASSERT (length == strlen (expected1)
+                        || length == strlen (expected2)
+                        || length == strlen (expected3));
+                ASSERT (result != NULL);
+                if (length == strlen (expected1))
+                  ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+                else if (length == strlen (expected2))
+                  ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
+                          || memcmp (result, expected3, strlen (expected3)) == 0);
+                free (result);
+              }
+              break;
+            case iconveh_replacement_character:
+              {
+                /* glibc result */
+                static const char expected1[] = "\2041\2447\2041\2447\2041\2447\2041\2447\2041\2447";
+                /* libiconv <= 1.12 result */
+                static const char expected2[] = "\2041\24472D/YQNhB";
+                /* libiconv >= 1.13 result */
+                static const char expected3[] = "\2041\2447\2013\2030\2013\2114";
+                ASSERT (retval == 0);
+                ASSERT (length == strlen (expected1)
+                        || length == strlen (expected2)
+                        || length == strlen (expected3));
+                ASSERT (result != NULL);
+                if (length == strlen (expected1))
+                  ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
+                else if (length == strlen (expected2))
+                  ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
+                          || memcmp (result, expected3, strlen (expected3)) == 0);
+                free (result);
+              }
             }
         }
+#   endif
 #  endif
 # endif
     }
@@ -589,6 +857,7 @@ main ()
               ASSERT (result == NULL && errno == EILSEQ);
               break;
             case iconveh_question_mark:
+            case iconveh_replacement_character:
             case iconveh_escape_sequence:
               {
                 static const char expected[] = "Rafa? Maszkowski";
@@ -619,6 +888,7 @@ main ()
               ASSERT (result == NULL && errno == EILSEQ);
               break;
             case iconveh_question_mark:
+            case iconveh_replacement_character:
               {
                 static const char expected[] = "Rafa? Maszkowski";
                 ASSERT (result != NULL);
@@ -652,6 +922,22 @@ main ()
       free (result);
     }
 
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
+  for (h = 0; h < SIZEOF (handlers); h++)
+    {
+      enum iconv_ilseq_handler handler = handlers[h];
+      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+      static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+      char *result = str_cd_iconveh (input,
+                                     &cdeh_88591_to_gb18030,
+                                     handler);
+      ASSERT (result != NULL);
+      ASSERT (strcmp (result, expected) == 0);
+      free (result);
+    }
+# endif
+
   /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
   for (h = 0; h < SIZEOF (handlers); h++)
     {
@@ -688,8 +974,51 @@ main ()
             free (result);
           }
           break;
+        case iconveh_replacement_character:
+          {
+            static const char expected[] = "Rafa\357\277\275 Maszkowski";
+            ASSERT (result != NULL);
+            ASSERT (strcmp (result, expected) == 0);
+            free (result);
+          }
+          break;
+        }
+    }
+
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ).  */
+  for (h = 0; h < SIZEOF (handlers); h++)
+    {
+      enum iconv_ilseq_handler handler = handlers[h];
+      static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski */
+      char *result = str_cd_iconveh (input,
+                                     &cdeh_ascii_to_gb18030,
+                                     handler);
+      switch (handler)
+        {
+        case iconveh_error:
+          ASSERT (result == NULL && errno == EILSEQ);
+          break;
+        case iconveh_question_mark:
+        case iconveh_escape_sequence:
+          {
+            static const char expected[] = "Rafa? Maszkowski";
+            ASSERT (result != NULL);
+            ASSERT (strcmp (result, expected) == 0);
+            free (result);
+          }
+          break;
+        case iconveh_replacement_character:
+          {
+            static const char expected[] = "Rafa\2041\2447 Maszkowski";
+            ASSERT (result != NULL);
+            ASSERT (strcmp (result, expected) == 0);
+            free (result);
+          }
+          break;
         }
     }
+# endif
 
   /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ.  */
   for (h = 0; h < SIZEOF (handlers); h++)
@@ -705,6 +1034,7 @@ main ()
           ASSERT (result == NULL && errno == EILSEQ);
           break;
         case iconveh_question_mark:
+        case iconveh_replacement_character:
           {
             static const char expected[] = "Costs: 27 ?";
             ASSERT (result != NULL);
@@ -801,6 +1131,7 @@ main ()
                 free (offsets);
               break;
             case iconveh_question_mark:
+            case iconveh_replacement_character:
               {
                 static const char expected[] = "Rafa? Maszkowski";
                 ASSERT (retval == 0);
@@ -870,6 +1201,41 @@ main ()
         }
     }
 
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
+  for (h = 0; h < SIZEOF (handlers); h++)
+    {
+      enum iconv_ilseq_handler handler = handlers[h];
+      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+      static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+      for (o = 0; o < 2; o++)
+        {
+          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
+          char *result = NULL;
+          size_t length = 0;
+          int retval = mem_iconveh (input, strlen (input),
+                                    "ISO-8859-1", "GB18030",
+                                    handler,
+                                    offsets,
+                                    &result, &length);
+          ASSERT (retval == 0);
+          ASSERT (length == strlen (expected));
+          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
+          if (o)
+            {
+              for (i = 0; i < 37; i++)
+                ASSERT (offsets[i] == (i < 1 ? i :
+                                       i < 12 ? i + 3 :
+                                       i < 18 ? i + 6 :
+                                       i + 7));
+              ASSERT (offsets[37] == MAGIC);
+              free (offsets);
+            }
+          free (result);
+        }
+    }
+# endif
+
   /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
   for (h = 0; h < SIZEOF (handlers); h++)
     {
@@ -931,6 +1297,7 @@ main ()
                 free (offsets);
               break;
             case iconveh_question_mark:
+            case iconveh_replacement_character:
               {
                 static const char expected[] = "Rafa? Maszkowski";
                 ASSERT (retval == 0);
@@ -1023,6 +1390,7 @@ main ()
           ASSERT (result == NULL && errno == EILSEQ);
           break;
         case iconveh_question_mark:
+        case iconveh_replacement_character:
           {
             static const char expected[] = "Rafa? Maszkowski";
             ASSERT (result != NULL);
@@ -1053,6 +1421,20 @@ main ()
       free (result);
     }
 
+# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
+  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
+  for (h = 0; h < SIZEOF (handlers); h++)
+    {
+      enum iconv_ilseq_handler handler = handlers[h];
+      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
+      static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118";
+      char *result = str_iconveh (input, "ISO-8859-1", "GB18030", handler);
+      ASSERT (result != NULL);
+      ASSERT (strcmp (result, expected) == 0);
+      free (result);
+    }
+# endif
+
   /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
   for (h = 0; h < SIZEOF (handlers); h++)
     {
@@ -1077,6 +1459,7 @@ main ()
           ASSERT (result == NULL && errno == EILSEQ);
           break;
         case iconveh_question_mark:
+        case iconveh_replacement_character:
           {
             static const char expected[] = "Costs: 27 ?";
             ASSERT (result != NULL);
[Prev in Thread]
Current Thread
[Next in Thread]
Re: [striconveh] Error handling and Unicode replacement character, Bruno Haible <=
- Re: [striconveh] Error handling and Unicode replacement character, Bruno Haible, 2022/01/02
- Re: [striconveh] Error handling and Unicode replacement character, Marc Nieper-Wißkirchen, 2022/01/05
Prev by Date: Some generated header files are messed up on Alpine
Next by Date: Re: Some generated header files are messed up on Alpine
Previous by thread: Some generated header files are messed up on Alpine
Next by thread: Re: [striconveh] Error handling and Unicode replacement character
Index(es):
- Date
- Thread