commit 7baf4f407ff3979824a942fbd2226ae57962fd41
Author: Gavin Smith <address@hidden>
Date:   Thu Mar 20 22:52:26 2014 +0000

    Character encoding conversion
    
    This change requires the iconv module to be added from gnulib.
    
    info-utils.c (input_length, convert_encoding_p, file_is_in_utf8)
    (iconv_to_output, iconv_to_utf8): New file-level variables.
    (init_conversion, close_conversion): New functions.
    (init_output_stream): Arguments changed.  Call init_conversion.
    (copy_direct, degrade_utf8, copy_converting): New functions.
    (copy_input_to_output): Conditionally convert character encoding
    of copied text.
    (scan_node_contents): Arguments changed.
    (text_buffer_space_left, text_buffer_iconv): New functions.
    
    nodes.c (get_file_character_encoding): New function.
    (info_load_file_internal): Call get_file_character_encoding.
    nodes.h (LOCAL_VARIABLES_LABEL, CHARACTER_ENCODING_LABEL): New
    preprocessor symbols.
    (FILE_BUFFER): New field 'encoding'.

diff --git a/info-utils.c b/info-utils.c
index 6c1f7e6..0b6fee0 100644
--- a/info-utils.c
+++ b/info-utils.c
@@ -23,6 +23,12 @@
 #include "info-utils.h"
 #include "tag.h"
 
+#include <nl_types.h>
+#include <langinfo.h>
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
+
 #if defined (HANDLE_MAN_PAGES)
 #  include "man.h"
 #endif /* HANDLE_MAN_PAGES */
@@ -452,18 +458,100 @@ static int rewrite_p;
 
 static char *input_start, *inptr;
 
+/* Number of bytes in node contents. */
+static size_t input_length;
+
 struct text_buffer output_buf;
 
 static NODE **anchor_to_adjust;
 static int nodestart;
 
+/* Whether we are converting the character encoding of the file. */
+static int convert_encoding_p;
+
+#if HAVE_ICONV
+
+/* Whether text in file is encoded in UTF-8. */
+static int file_is_in_utf8;
+
+/* Used for conversion from file encoding to output encoding. */
+static iconv_t iconv_to_output;
+
+/* Conversion from file encoding to UTF-8. */
+static iconv_t iconv_to_utf8;
+
+#endif /* HAVE_ICONV */
+
+/* Set up iconv conversion from FB's declared encoding to the locale's.
+   Sets convert_encoding_p and rewrite_p only if conversion will be done. */
+void
+init_conversion (FILE_BUFFER *fb)
+{
+  char *target_encoding;
+
+  convert_encoding_p = 0;
+
+#if !HAVE_ICONV
+  return;
+#else
+  file_is_in_utf8 = 0;
+
+  /* Nothing to do when the file does not declare an encoding. */
+  if (fb->encoding == 0)
+    return;
+
+  /* The locale's codeset is the encoding we want to output. */
+  target_encoding = nl_langinfo (CODESET);
+
+  /* File and locale already agree: leave the text alone. */
+  if (strcasecmp (target_encoding, fb->encoding) == 0)
+    return;
+
+  /* Give up if iconv has no conversion from file to locale encoding. */
+  iconv_to_output = iconv_open (target_encoding, fb->encoding);
+  if (iconv_to_output == (iconv_t) -1)
+    return;
+
+  file_is_in_utf8 = (strcasecmp ("UTF8", fb->encoding) == 0
+                     || strcasecmp ("UTF-8", fb->encoding) == 0);
+
+  /* A file not in UTF-8 also needs a conversion to UTF-8.  If there is
+     none, release the first descriptor and leave conversion disabled. */
+  if (!file_is_in_utf8)
+    {
+      iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
+      if (iconv_to_utf8 == (iconv_t) -1)
+        {
+          iconv_close (iconv_to_output);
+          return;
+        }
+    }
+
+  convert_encoding_p = 1;
+  rewrite_p = 1;
+#endif /* HAVE_ICONV */
+}
+
+void close_conversion (void)   /* Close iconv descriptors; idempotent. */
+{
+#if HAVE_ICONV
+  if (!convert_encoding_p)
+    return;
+  iconv_close (iconv_to_output);
+  if (!file_is_in_utf8) iconv_close (iconv_to_utf8);
+  convert_encoding_p = 0; /* Descriptors are gone: prevent reuse/re-close. */
+#endif
+}
+
 /* Difference between the number of bytes input in the file and
    bytes output. */
 static long int output_bytes_difference;
 
 static void
-init_output_stream (void)
+init_output_stream (FILE_BUFFER *fb)
 {
+  init_conversion (fb);
+
   if (rewrite_p)
     {
       text_buffer_init (&output_buf);
@@ -471,35 +559,268 @@ init_output_stream (void)
     }
 }
 
+/* Pass the next N input bytes through to the output buffer unconverted. */
+static void
+copy_direct (size_t n)
+{
+  inptr += n;
+  text_buffer_add_string (&output_buf, inptr - n, n);
+}
+
+/* Read one UTF-8 character at *FROM (*FROM_LEFT bytes available) and write
+   an ASCII stand-in to output_buf.  *FROM and *FROM_LEFT are moved past
+   the character.  Return 1 if a replacement was known, else 0. */
+static int
+degrade_utf8 (char **from, size_t *from_left)
+{
+  static const struct encoding_replacement
+  {
+    const char *from_string;
+    const char *to_string;
+  } er[] = {
+    {"\xE2\x80\x98","'"}, /* Opening single quote */
+    {"\xE2\x80\x99","'"}, /* Closing single quote */
+    {"\xE2\x80\x9C","\""},/* Opening double quote */
+    {"\xE2\x80\x9D","\""},/* Closing double quote */
+    {"\xC2\xA9","(C)"},   /* Copyright symbol */
+    {"\xC2\xBB",">>"},    /* Closing double angle brackets */
+    {"\xE2\x86\x92","->"},/* Right arrow */
+
+    {"\xC3\xA0","a`"},   /* Lower case letter a with grave accent */
+    {"\xC3\xA2","a^"},   /* Lower case letter a with circumflex */
+    {"\xC3\xA4","a\""},  /* Lower case letter a with diaeresis */
+    {"\xC3\xA6","ae"},   /* Lower case letter ae ligature */
+    {"\xC3\xA9","e'"},   /* Lower case letter e with acute accent */
+    {"\xC3\xA8","e`"},   /* Lower case letter e with grave accent */
+    {"\xC3\xAA","e^"},   /* Lower case letter e with circumflex */
+    {"\xC3\xAB","e\""},  /* Lower case letter e with diaeresis */
+    {"\xC3\xB6","o\""},  /* Lower case letter o with diaeresis */
+    {"\xC3\xBC","u\""},  /* Lower case letter u with diaeresis */
+    {"\xC3\xB1","n~"},  /* Lower case letter n with tilde */
+    {"\xC3\x87","C,"},  /* Upper case letter C with cedilla */
+    {"\xC3\xA7","c,"},  /* Lower case letter c with cedilla */
+    {"\xC3\x9f","ss"},  /* Lower case letter sharp s */
+    {0, 0}
+  };
+
+  const struct encoding_replacement *erp;
+
+  if (*from_left == 0) /* Nothing to read; don't let *FROM_LEFT wrap. */
+    return 0;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      /* Avoid reading past end of input. */
+      size_t width = strlen (erp->from_string);
+      if (width > *from_left)
+        continue;
+
+      if (!strncmp (erp->from_string, *from, width))
+        {
+          text_buffer_add_string (&output_buf, erp->to_string,
+                                  strlen (erp->to_string));
+          *from += width;
+          *from_left -= width;
+          return 1;
+        }
+    }
+
+  /* No replacement known.  Print one question mark and skip the whole
+     character: its lead byte plus any continuation bytes (10xxxxxx). */
+  text_buffer_add_string (&output_buf, "?", 1);
+  do
+    {
+      (*from)++;
+      (*from_left)--;
+    }
+  while (*from_left > 0 && ((unsigned char) **from & 0xC0) == 0x80);
+
+  return 0;
+}
+
+/* Convert N bytes from input to output encoding and write to output
+   buffer.  Return number of input bytes read beyond N (see EINVAL below). */
+static int
+copy_converting (size_t n)
+{
+#if !HAVE_ICONV
+  return 0;
+#else
+  size_t bytes_left;
+  int extra_at_end;
+  size_t iconv_ret;
+  long output_start;
+  size_t utf8_char_free;
+  char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
+  char *utf8_char_ptr;
+  size_t i; /* Not int: its address is passed to degrade_utf8. */
+
+  /* Use n as an estimate of how many bytes will be required
+     in target encoding. */
+  text_buffer_alloc (&output_buf, n);
+
+  output_start = text_buffer_off (&output_buf);
+  bytes_left = n;
+  extra_at_end = 0;
+  while (1) /* bytes_left is unsigned; left only by break or return. */
+    {
+      iconv_ret = text_buffer_iconv (&output_buf, iconv_to_output,
+                                     &inptr, &bytes_left);
+
+      if (iconv_ret != (size_t) -1)
+        /* Success: all of input converted. */
+        break;
+
+      /* There's been an error while converting. */
+      switch (errno)
+        {
+        case E2BIG:
+          /* Ran out of space in output buffer.  Ask for more than is
+             free now, so that the buffer really grows, and try again. */
+          text_buffer_alloc (&output_buf,
+                             text_buffer_space_left (&output_buf) + n);
+          continue;
+        case EILSEQ:
+          /* Byte sequence in input buffer not recognized. */
+          break;
+        case EINVAL:
+          /* Incomplete byte sequence at end of input buffer.  Try to read
+             more. */
+          /* input_length - 2 is offset of last-but-one byte within input.
+             This checks if there is at least one more byte within node
+             contents. */
+          if (inptr - input_start + (bytes_left - 1) <= input_length - 2)
+            {
+              bytes_left++;
+              extra_at_end++;
+            }
+          else
+            {
+              copy_direct (bytes_left);
+              bytes_left = 0;
+            }
+          continue; /* Retry the conversion; there is nothing to degrade. */
+        default: /* Unknown error - abort */
+          info_error (_("Error converting file character encoding."));
+          /* Skip past current input and hope we don't get an
+             error next time. */
+          inptr += bytes_left;
+          return 0;
+        }
+
+      /* Only EILSEQ gets here.  Degrade to ASCII. */
+      if (file_is_in_utf8)
+        {
+          degrade_utf8 (&inptr, &bytes_left);
+          continue;
+        }
+
+      /* If file is not in UTF-8, we degrade to ASCII in two steps:
+         first convert the character to UTF-8, then look up a replacement
+         string.  Note that mixing iconv_to_output and iconv_to_utf8
+         on the same input may not work well if the input encoding
+         is stateful. */
+
+      /* We want to read exactly one character.  Do this by
+         restricting size of output buffer. */
+      utf8_char_ptr = utf8_char;
+      for (i = 1; i <= 4; i++)
+        {
+          utf8_char_free = i;
+          iconv_ret = iconv (iconv_to_utf8, &inptr, &bytes_left,
+                             &utf8_char_ptr, &utf8_char_free);
+          /* If we managed to write a character: */
+          if (utf8_char_ptr > utf8_char) break;
+        }
+
+      /* errno == E2BIG if iconv ran out of output buffer,
+         which is expected. */
+      if (iconv_ret == (size_t) -1 && errno != E2BIG)
+        {
+          /* Character is not recognized.  Copy one byte and count it. */
+          copy_direct (1);
+          bytes_left--;
+        }
+      else
+        {
+          utf8_char_ptr = utf8_char;
+          /* i is width of UTF-8 character */
+          degrade_utf8 (&utf8_char_ptr, &i);
+        }
+    }
+
+  /* Must cast because the difference between unsigned size_t is always
+     positive. */
+  output_bytes_difference +=
+    (signed long) n
+    - (signed long) (text_buffer_off (&output_buf) - output_start);
+
+  return extra_at_end;
+#endif /* HAVE_ICONV */
+}
+
+/* Copy N bytes from the input node contents to the output buffer (or just
+   skip them if !rewrite_p), converting encoding and adjusting anchors. */
 static void
 copy_input_to_output (size_t n)
 {
   if (rewrite_p)
     {
-      text_buffer_add_string (&output_buf, inptr, n);
-      inptr += n;
+      long bytes_left; /* Signed: conversion may consume more than N. */
 
-      /* Check if we have gone past any anchors and
-         adjust with output_bytes_difference. */
-      if (anchor_to_adjust)
+      bytes_left = (long) n;
+      while (bytes_left > 0)
         {
-          while ((*anchor_to_adjust)->nodestart - nodestart
-                 <= inptr - input_start)
+          if (!convert_encoding_p)
             {
-              (*anchor_to_adjust)->nodestart -= output_bytes_difference;
-              anchor_to_adjust++;
-              if (!*anchor_to_adjust || (*anchor_to_adjust)->nodelen != 0)
+              copy_direct ((size_t) bytes_left);
+              bytes_left = 0;
+            }
+          else
+            {
+              size_t bytes_to_convert;
+              int extra_written;
+
+              if (anchor_to_adjust)
                 {
-                  anchor_to_adjust = 0;
-                  break;
+                  char *first_anchor = /* Anchor offsets count from file start. */
+                     input_start + (*anchor_to_adjust)->nodestart - nodestart;
+
+                  /* If there is an anchor in the input still to be copied: */
+                  if (first_anchor >= inptr && first_anchor < inptr + bytes_left)
+                    /* Convert enough to pass the first anchor in input. */
+                    bytes_to_convert = first_anchor - inptr + 1;
+                  else
+                    bytes_to_convert = (size_t) bytes_left;
                 }
+              else
+                bytes_to_convert = (size_t) bytes_left;
+
+              /* copy_converting may read more than bytes_to_convert bytes
+                 if its input ends in an incomplete byte sequence. */
+              extra_written = copy_converting (bytes_to_convert);
+
+              bytes_left -= (long) bytes_to_convert + extra_written;
             }
+
+          /* Check if we have gone past any anchors and
+             adjust with output_bytes_difference. */
+          if (anchor_to_adjust)
+            while ((*anchor_to_adjust)->nodestart - nodestart
+                   <= inptr - input_start)
+              {
+                (*anchor_to_adjust)->nodestart -= output_bytes_difference;
+                anchor_to_adjust++;
+                if (!*anchor_to_adjust || (*anchor_to_adjust)->nodelen != 0)
+                  {
+                    anchor_to_adjust = 0;
+                    break;
+                  }
+              }
         }
     }
   else
-    {
-      inptr += n;
-    }
+    inptr += n;
 }
 
 static void
@@ -630,16 +951,16 @@ colon_after_newline (char *nodeptr)
     return -1;
 }
 
-/* Remove syntax from (*NODE)->contents and build list of references
+/* Remove syntax from FB->tags[TAG]->contents and build list of references
    in node.  Adjust anchors in tag table that point into node text.*/
 void
-scan_node_contents (NODE **tag)
+scan_node_contents (FILE_BUFFER *fb, int tag)
 {
   SEARCH_BINDING s;
   char *search_string;
 
   int found_menu_entry, in_index = 0;
-  NODE *node = *tag;
+  NODE *node = fb->tags[tag];
 
   REFERENCE **refs = NULL;
   size_t refs_index = 0, refs_slots = 0;
@@ -654,11 +975,10 @@ scan_node_contents (NODE **tag)
   else
     rewrite_p = 0;
 
-  if (rewrite_p)
-    init_output_stream ();
+  init_output_stream (fb);
 
   /* Set anchor_to_adjust to first anchor in node, if any. */
-  anchor_to_adjust = tag + 1;
+  anchor_to_adjust = &fb->tags[tag + 1];
   if (!*anchor_to_adjust)
     anchor_to_adjust = 0;
   else if (*anchor_to_adjust && (*anchor_to_adjust)->nodelen != 0)
@@ -675,6 +995,7 @@ scan_node_contents (NODE **tag)
      all other assignment should be done with the helper functions above. */
   inptr = node->contents;
   input_start = node->contents;
+  input_length = node->nodelen;
   nodestart = node->nodestart;
 
   parse_top_node_line (node);
@@ -995,7 +1316,7 @@ search_again:
               if (!rewrite_p)
                 {
                   rewrite_p = 1;
-                  init_output_stream ();
+                  init_output_stream (fb);
 
                   /* Put inptr back to start so that
                      copy_input_to_output below gets all
@@ -1045,6 +1366,9 @@ search_again:
   if (rewrite_p)
     text_buffer_add_string (&output_buf, "\0", 1);
 
+  /* Free resources used in character encoding conversion. */
+  close_conversion ();
+
   node->references = refs;
 
   if (rewrite_p)
@@ -1285,6 +1609,7 @@ text_buffer_vprintf (struct text_buffer *buf, const char *format, va_list ap)
   return n;
 }
 
+/* Make sure there are LEN free bytes at end of BUF. */
 void
 text_buffer_alloc (struct text_buffer *buf, size_t len)
 {
@@ -1297,6 +1622,39 @@ text_buffer_alloc (struct text_buffer *buf, size_t len)
     }
 }
 
+/* Number of bytes BUF can still take without being reallocated. */
+size_t
+text_buffer_space_left (struct text_buffer *buf)
+{
+  /* buf->off is where the next byte goes; buf->size is the offset one
+     past the end of the allocated space. */
+  size_t unused = buf->size - buf->off;
+  return unused;
+}
+
+#if HAVE_ICONV
+
+/* Run iconv with descriptor ICONV_STATE, using the free space at the end
+   of BUF as output buffer and advancing BUF's offset.  Return as iconv. */
+size_t
+text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
+                   char **inbuf, size_t *inbytesleft)
+{
+  size_t out_bytes_left;
+  char *outptr;
+  size_t iconv_ret;
+
+  outptr = text_buffer_base (buf) + text_buffer_off (buf);
+  out_bytes_left = text_buffer_space_left (buf);
+  /* Use the caller's descriptor, not the file-level iconv_to_output. */
+  iconv_ret = iconv (iconv_state, inbuf, inbytesleft,
+                     &outptr, &out_bytes_left);
+  text_buffer_off (buf) = outptr - text_buffer_base (buf);
+  return iconv_ret;
+}
+
+#endif /* HAVE_ICONV */
+
 size_t
 text_buffer_add_string (struct text_buffer *buf, const char *str, size_t len)
 {
diff --git a/info-utils.h b/info-utils.h
index 60825bf..88c2e9f 100644
--- a/info-utils.h
+++ b/info-utils.h
@@ -26,6 +26,10 @@
 #include "window.h"
 #include "search.h"
 
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
+
 /* When non-zero, various display and input functions handle ISO Latin
    character sets correctly. */
 extern int ISO_Latin_p;
@@ -56,10 +60,8 @@ extern char *info_parsed_nodename;
 */ 
 int info_parse_node (char *string, int flag);
 
-/* NODE points to a tag table entry. Scan (*NODE)->contents for references
-   and set (*NODE)->references. If preprocess_nodes_p=On, remove syntax
-   from NODE->contents. Adjust anchors in tag table in this node. */
-void scan_node_contents (NODE **node);
+/* Scan contents of FB->tags[TAG]. */
+void scan_node_contents (FILE_BUFFER *fb, int tag);
 
 /* Get the entry associated with LABEL in REFERENCES.  Return a pointer to
    the reference if found, or NULL. */
@@ -120,6 +122,11 @@ void text_buffer_free (struct text_buffer *buf);
 void text_buffer_alloc (struct text_buffer *buf, size_t len);
 size_t text_buffer_vprintf (struct text_buffer *buf, const char *format,
 			    va_list ap);
+size_t text_buffer_space_left (struct text_buffer *buf);
+#if HAVE_ICONV
+size_t text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
+                          char **inbuf, size_t *inbytesleft);
+#endif
 size_t text_buffer_add_string (struct text_buffer *buf, const char *str,
 			       size_t len);
 size_t text_buffer_fill (struct text_buffer *buf, int c, size_t len);
diff --git a/nodes.c b/nodes.c
index 8541c8f..ce3df00 100644
--- a/nodes.c
+++ b/nodes.c
@@ -42,6 +42,7 @@ static void get_tags_of_indirect_tags_table (FILE_BUFFER *file_buffer,
     SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
 static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
 static char *adjust_nodestart (NODE *node, int min, int max);
+static void get_file_character_encoding (FILE_BUFFER *fb);
 static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
 static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
 static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -340,6 +341,53 @@ info_find_file_internal (char *filename, int get_tags)
   return file_buffer;
 }
 
+/* Look for a local variables section within the last 1000 bytes of FB and
+   set FB->encoding to a newly allocated copy of the name following
+   "coding:" there, or to null if none is found. */
+static void
+get_file_character_encoding (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+  long int enc_start, enc_end;
+  char *enc_string;
+
+  /* See if there is a local variables section in this info file. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  /* Null means the encoding is unknown. */
+  fb->encoding = 0;
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward (CHARACTER_ENCODING_LABEL, &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += strlen(CHARACTER_ENCODING_LABEL); /* Skip to after "coding:" */
+  enc_start += skip_whitespace(fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* Name ends at the newline, or at end of file if there is none. */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+
+  enc_string = xmalloc (enc_end - enc_start + 1);
+  strncpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
+  enc_string[enc_end - enc_start] = '\0';
+  fb->encoding = enc_string;
+}
+
 /* The workhorse function for info_load_file ().  Non-zero second argument
    says to build a list of tags (or nodes) for this file.  This is the
    default behaviour when info_load_file () is called, but it is not
@@ -412,6 +460,9 @@ info_load_file_internal (char *filename, int get_tags)
   if (compressed)
     file_buffer->flags |= N_IsCompressed;
   
+  /* Find encoding of file, if set */
+  get_file_character_encoding (file_buffer);
+
   /* If requested, build the tags and nodes for this file buffer. */
   if (get_tags)
     build_tags_and_nodes (file_buffer);
@@ -1058,7 +1109,7 @@ info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer, char *nodename)
 
             /* Read locations of references in node and similar. Rewrite
                node from tag->contents if preprocess_nodes=On. */
-            scan_node_contents (&file_buffer->tags[i]);
+            scan_node_contents (file_buffer, i);
 
             *node = *tag;
 	  }
diff --git a/nodes.h b/nodes.h
index 5f0a23d..ac0d432 100644
--- a/nodes.h
+++ b/nodes.h
@@ -98,6 +98,9 @@ typedef struct {
 #define TAGS_TABLE_BEG_LABEL            "Tag Table:\n"
 #define INDIRECT_TAGS_TABLE_LABEL       "Indirect:\n"
 #define TAGS_TABLE_IS_INDIRECT_LABEL    "(Indirect)"
+#define LOCAL_VARIABLES_LABEL           "Local Variables"
+#define CHARACTER_ENCODING_LABEL        "coding:"
+
 
 /* Character constants. */
 #define INFO_COOKIE '\037'
@@ -121,6 +124,7 @@ typedef struct {
   NODE **tags;                  /* If non-null, the indirect tags table. */
   size_t tags_slots;            /* Number of slots allocated for TAGS. */
   int flags;                    /* Various flags.  Mimics of N_* flags. */
+  char *encoding;               /* Name of character encoding of file. */
 } FILE_BUFFER;
 
 /* Externally visible functions.  */