bug-texinfo
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Using iconv in stand-alone info


From: Eli Zaretskii
Subject: Re: Using iconv in stand-alone info
Date: Thu, 24 Dec 2015 19:18:42 +0200

> Date: Wed, 23 Dec 2015 19:15:18 +0000
> From: Gavin Smith <address@hidden>
> Cc: Texinfo <address@hidden>
> 
> >       /* We want to read exactly one character.  Do this by
> >          restricting size of output buffer. */
> >       utf8_char_ptr = utf8_char;
> >       for (i = 1; i <= 4; i++)
> >         {
> >           utf8_char_free = i;
> >           iconv_ret = iconv (iconv_to_utf8, &inptr, &bytes_left,
> >                              &utf8_char_ptr, &utf8_char_free);
> >           /* If we managed to write a character: */
> >           if (utf8_char_ptr > utf8_char) break;
> >         }
> >
> > Why cannot you restrict the size of input instead?
> 
> I really can't remember for sure (it was like that in the earliest
> version of the code I posted to this list in Feb. 2014). The only
> possible reason I've been able to think of just now is that the
> maximum length in bytes required in the input to get an output
> character is unknown (the input character could be preceded by a
> locking shift sequence of a few bytes, for example), but for the
> output it's known that no UTF-8 character can be more than 4 bytes.

OK, makes sense.

Here's what I came up with, please see if it looks better now.

Having played with this code, I must say that I feel it's based on
somewhat fragile assumptions whose validity is not clear to me.

We have here a loop that converts between 2 encodings, the files'
encoding and the locale encoding needed to display the text.  Whenever
this conversion fails, presumably because some character cannot be
expressed in the target encoding, we convert it to UTF-8 and then
re-express it in ASCII art.  Then we continue the loop, once again
trying to convert the file encoding into the locale encoding -- until
it fails again.  For each of the conversions, we use a separate
iconv_t state variable, but we use the same input buffer pointer and
the same variable to track the number of bytes left unconverted.  So
there's a ping-pong between 2 separate conversions, and the assumption
seems to be that each conversion advances the input pointer and the
bytes-left variable according to what it produced.  In particular,
there's clearly an assumption that if no characters were produced, the
input pointer is left at its original value, and if N characters were
produced, the input pointer advances across all the bytes
corresponding to those characters, and no more.

But the libiconv documentation never promises anything like that.  In
some specific situations, the documentation says where the input
pointer is left, but the rest of the situations are simply left
undocumented.  And in my limited testing I clearly saw that sometimes
the input pointer moves across more bytes than were needed to produce
the output.

So it looks to me that by switching between two different conversions
we play with fire: each conversion keeps its state consistent with the
input pointer, but it doesn't expect us to pass the pointer to some
other conversion.

I tried to work around these hidden issues as much as I could, but I'm
not sure the result is reliable enough.  I think we should seek advice
from some expert, like Bruno Haible, because I'm quite sure what Info
does is quite unique.

Anyway, here's the patch:

--- info/info-utils.c~0 2015-12-19 18:47:40.000000000 +0200
+++ info/info-utils.c   2015-12-24 18:59:04.436075700 +0200
@@ -830,14 +830,14 @@ copy_converting (long n)
 #if !HAVE_ICONV
   return 0;
 #else
-  size_t bytes_left;
+  size_t bytes_left, orig_bytes_left;
   int extra_at_end;
   size_t iconv_ret;
   long output_start;
 
   size_t utf8_char_free; 
   char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
-  char *utf8_char_ptr;
+  char *utf8_char_ptr, *orig_inptr;
   size_t i;
   
   /* Use n as an estimate of how many bytes will be required
@@ -850,9 +850,15 @@ copy_converting (long n)
   while (bytes_left >= 0)
     {
       iconv_ret = text_buffer_iconv (&output_buf, iconv_to_output,
-                                     &inptr, &bytes_left);
+                                     (ICONV_CONST char **)&inptr, &bytes_left);
 
-      if (iconv_ret != (size_t) -1)
+      /* Make sure libiconv flushes out the last converted character.
+        This is required when the conversion is stateful, in which
+        case libiconv might not output the last charcater, waiting to
+        see whether it should be combined with the next one.  */
+      if (iconv_ret != (size_t) -1
+         && text_buffer_iconv (&output_buf, iconv_to_output,
+                               NULL, NULL) != (size_t) -1)
         /* Success: all of input converted. */
         break;
 
@@ -912,25 +918,47 @@ copy_converting (long n)
       /* We want to read exactly one character.  Do this by
          restricting size of output buffer. */
       utf8_char_ptr = utf8_char;
+      orig_inptr = inptr;
+      orig_bytes_left = bytes_left;
       for (i = 1; i <= 4; i++)
         {
           utf8_char_free = i;
-          iconv_ret = iconv (iconv_to_utf8, &inptr, &bytes_left,
-                             &utf8_char_ptr, &utf8_char_free);
-          /* If we managed to write a character: */
-          if (utf8_char_ptr > utf8_char) break;
+          errno = 0;
+          iconv_ret = iconv (iconv_to_utf8, (ICONV_CONST char **)&inptr,
+                             &bytes_left, &utf8_char_ptr, &utf8_char_free);
+          if ((iconv_ret == (size_t) -1 && errno != E2BIG)
+              /* If we managed to convert a character: */
+              || utf8_char_ptr > utf8_char)
+            break;
         }
 
       /* errno == E2BIG if iconv ran out of output buffer,
          which is expected. */
       if (iconv_ret == (size_t) -1 && errno != E2BIG)
-        /* Character is not recognized.  Copy a single byte. */
-        copy_direct (1);
+       {
+         /* Character is not recognized.  Copy a single byte.  */
+         inptr = orig_inptr;   /* iconv might have incremented inptr  */
+         copy_direct (1);
+         bytes_left = orig_bytes_left - 1;
+       }
       else
         {
           utf8_char_ptr = utf8_char;
           /* i is width of UTF-8 character */
           degrade_utf8 (&utf8_char_ptr, &i);
+         /* If we are done, make sure iconv flushes the last character.  */
+         if (bytes_left <= 0)
+           {
+             utf8_char_ptr = utf8_char;
+             i = 4;
+             iconv (iconv_to_utf8, NULL, NULL,
+                    &utf8_char_ptr, &utf8_char_free);
+             if (utf8_char_ptr > utf8_char)
+               {
+                 utf8_char_ptr = utf8_char;
+                 degrade_utf8 (&utf8_char_ptr, &i);
+               }
+           }
         }
     }
 
@@ -1925,7 +1953,7 @@ text_buffer_space_left (struct text_buff
 /* Run iconv using text buffer as output buffer. */
 size_t
 text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
-                   char **inbuf, size_t *inbytesleft)
+                   ICONV_CONST char **inbuf, size_t *inbytesleft)
 {
   size_t out_bytes_left;
   char *outptr;
@@ -1933,7 +1961,7 @@ text_buffer_iconv (struct text_buffer *b
 
   outptr = text_buffer_base (buf) + text_buffer_off (buf);
   out_bytes_left = text_buffer_space_left (buf);
-  iconv_ret = iconv (iconv_to_output, inbuf, inbytesleft,
+  iconv_ret = iconv (iconv_state, inbuf, inbytesleft,
                      &outptr, &out_bytes_left);
 
   text_buffer_off (buf) = outptr - text_buffer_base (buf);    
--- info/info-utils.h~0 2015-08-14 22:11:33.000000000 +0300
+++ info/info-utils.h   2015-12-24 10:56:07.258480000 +0200
@@ -123,7 +123,7 @@ size_t text_buffer_vprintf (struct text_
 size_t text_buffer_space_left (struct text_buffer *buf);
 #if HAVE_ICONV
 size_t text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
-                          char **inbuf, size_t *inbytesleft);
+                          ICONV_CONST char **inbuf, size_t *inbytesleft);
 #endif
 size_t text_buffer_add_string (struct text_buffer *buf, const char *str,
                               size_t len);



reply via email to

[Prev in Thread] Current Thread [Next in Thread]