commit 7baf4f407ff3979824a942fbd2226ae57962fd41
Author: Gavin Smith
Date: Thu Mar 20 22:52:26 2014 +0000
Character encoding conversion
This change requires the iconv module to be added from gnulib.
info-utils.c (input_length, convert_encoding_p, file_is_in_utf8)
(iconv_to_output, iconv_to_utf8): New file-level variables.
(init_conversion): New function.
(init_output_stream): Arguments changed. Call init_conversion.
(copy_direct, degrade_utf8, copy_converting): New functions.
(copy_input_to_output): Conditionally convert character encoding
of copied text.
(scan_node_contents): Arguments changed.
(text_buffer_space_left, text_buffer_iconv): New functions.
nodes.c (get_file_character_encoding): New function.
(info_load_file_internal): Call get_file_character_encoding.
nodes.h (LOCAL_VARIABLES_LABEL, CHARACTER_ENCODING_LABEL): New
preprocessor symbols.
(FILE_BUFFER): New field 'encoding'.
diff --git a/info-utils.c b/info-utils.c
index 6c1f7e6..0b6fee0 100644
--- a/info-utils.c
+++ b/info-utils.c
@@ -23,6 +23,12 @@
#include "info-utils.h"
#include "tag.h"
+#include <nl_types.h>
+#include <langinfo.h>
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
+
#if defined (HANDLE_MAN_PAGES)
# include "man.h"
#endif /* HANDLE_MAN_PAGES */
@@ -452,18 +458,100 @@ static int rewrite_p;
static char *input_start, *inptr;
+/* Number of bytes in node contents. */
+static size_t input_length;
+
struct text_buffer output_buf;
static NODE **anchor_to_adjust;
static int nodestart;
+/* Whether we are converting the character encoding of the file. */
+static int convert_encoding_p;
+
+#if HAVE_ICONV
+
+/* Whether text in file is encoded in UTF-8. */
+static int file_is_in_utf8;
+
+/* Used for conversion from file encoding to output encoding. */
+static iconv_t iconv_to_output;
+
+/* Conversion from file encoding to UTF-8. */
+static iconv_t iconv_to_utf8;
+
+#endif /* HAVE_ICONV */
+
+/* Decide whether text from FB has to be converted to the character
+   encoding of the current locale, and open the iconv descriptors needed
+   for that.  convert_encoding_p is set (and rewrite_p forced on) only if
+   every required conversion could be opened; otherwise it is left at 0. */
+void
+init_conversion (FILE_BUFFER *fb)
+{
+  char *locale_codeset;
+
+  convert_encoding_p = 0;
+
+#if HAVE_ICONV
+  file_is_in_utf8 = 0;
+
+  /* A file whose encoding is unknown is left alone. */
+  if (!fb->encoding)
+    return;
+
+  /* Name of the character encoding used by the environment locale. */
+  locale_codeset = nl_langinfo (CODESET);
+
+  /* Nothing to do when the file already uses the locale's encoding. */
+  if (strcasecmp (locale_codeset, fb->encoding) == 0)
+    return;
+
+  /* Give up if iconv cannot convert from the file's encoding to the
+     locale's encoding. */
+  iconv_to_output = iconv_open (locale_codeset, fb->encoding);
+  if (iconv_to_output == (iconv_t) -1)
+    return;
+
+  file_is_in_utf8 = (strcasecmp ("UTF8", fb->encoding) == 0
+                     || strcasecmp ("UTF-8", fb->encoding) == 0);
+
+  /* For other encodings a second descriptor, converting to UTF-8, is
+     needed as well. */
+  if (!file_is_in_utf8)
+    {
+      iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
+      if (iconv_to_utf8 == (iconv_t) -1)
+        {
+          /* No such conversion: release the first descriptor again. */
+          iconv_close (iconv_to_output);
+          return;
+        }
+    }
+
+  convert_encoding_p = 1;
+  rewrite_p = 1;
+#endif /* HAVE_ICONV */
+}
+
+/* Close the iconv descriptors opened by init_conversion.  Does nothing
+   when no conversion is set up. */
+void
+close_conversion (void)
+{
+#if HAVE_ICONV
+  if (!convert_encoding_p)
+    return;
+
+  iconv_close (iconv_to_output);
+
+  /* The UTF-8 descriptor is only opened for files not already in UTF-8. */
+  if (!file_is_in_utf8)
+    iconv_close (iconv_to_utf8);
+#endif
+}
+
/* Difference between the number of bytes input in the file and
bytes output. */
static long int output_bytes_difference;
static void
-init_output_stream (void)
+init_output_stream (FILE_BUFFER *fb)
{
+ init_conversion (fb);
+
if (rewrite_p)
{
text_buffer_init (&output_buf);
@@ -471,35 +559,268 @@ init_output_stream (void)
}
}
+/* Append the next N bytes of input to the output buffer unchanged and
+   step the input pointer past them. */
+static void
+copy_direct (size_t n)
+{
+  char *chunk = inptr;
+
+  inptr = chunk + n;
+  text_buffer_add_string (&output_buf, chunk, n);
+}
+
+/* Read one UTF-8 character at *FROM, of which *FROM_LEFT bytes are
+   available, and write a sequence of bytes representing that character
+   in ASCII to output_buf.  *FROM is advanced past the character read and
+   *FROM_LEFT is reduced by the number of bytes consumed.
+
+   Return 1 if a replacement string was found in the table below.
+   Return 0 if the character was replaced by "?", or if *FROM_LEFT was
+   zero (in which case nothing is read or written). */
+static int
+degrade_utf8 (char **from, size_t *from_left)
+{
+  static const struct encoding_replacement
+  {
+    const char *from_string;
+    const char *to_string;
+  } er[] = {
+    {"\xE2\x80\x98","'"}, /* Opening single quote */
+    {"\xE2\x80\x99","'"}, /* Closing single quote */
+    {"\xE2\x80\x9C","\""},/* Opening double quote */
+    {"\xE2\x80\x9D","\""},/* Closing double quote */
+    {"\xC2\xA9","(C)"}, /* Copyright symbol */
+    {"\xC2\xBB",">>"}, /* Closing double angle brackets */
+    {"\xE2\x86\x92","->"},/* Right arrow */
+
+    {"\xC3\xA0","a`"}, /* Lower case letter a with grave accent */
+    {"\xC3\xA2","a^"}, /* Lower case letter a with circumflex */
+    {"\xC3\xA4","a\""}, /* Lower case letter a with diaeresis */
+    {"\xC3\xA6","ae"}, /* Lower case letter ae ligature */
+    {"\xC3\xA9","e'"}, /* Lower case letter e with acute accent */
+    {"\xC3\xA8","e`"}, /* Lower case letter e with grave accent */
+    {"\xC3\xAA","e^"}, /* Lower case letter e with circumflex */
+    {"\xC3\xAB","e\""}, /* Lower case letter e with diaeresis */
+    {"\xC3\xB6","o\""}, /* Lower case letter o with diaeresis */
+    {"\xC3\xBC","u\""}, /* Lower case letter u with diaeresis */
+    {"\xC3\xB1","n~"}, /* Lower case letter n with tilde */
+    {"\xC3\x87","C,"}, /* Upper case letter C with cedilla */
+    {"\xC3\xA7","c,"}, /* Lower case letter c with cedilla */
+    {"\xC3\x9f","ss"}, /* Lower case letter sharp s */
+
+    {0, 0}
+  };
+
+  const struct encoding_replacement *erp;
+  unsigned char lead;
+  size_t char_len;
+  size_t skipped;
+
+  /* With no input left, the decrement of *FROM_LEFT below would wrap
+     around to a huge value. */
+  if (*from_left == 0)
+    return 0;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      /* Avoid reading past end of input. */
+      size_t width = strlen (erp->from_string);
+      if (width > *from_left)
+        continue;
+
+      if (!strncmp (erp->from_string, *from, width))
+        {
+          text_buffer_add_string (&output_buf, erp->to_string,
+                                  strlen (erp->to_string));
+          *from += width;
+          *from_left -= width;
+          return 1;
+        }
+    }
+
+  /* Failing this, just print a question mark.  Maybe we should use SUB
+     (^Z) (ASCII substitute character code) instead. */
+  text_buffer_add_string (&output_buf, "?", 1);
+
+  /* Advance over one whole UTF-8 character, so that an unknown
+     multibyte character yields a single "?" rather than one per byte.
+     The lead byte gives the expected length of the sequence; anything
+     that is not a valid lead byte is skipped on its own. */
+  lead = (unsigned char) **from;
+  if ((lead & 0xE0) == 0xC0)
+    char_len = 2;
+  else if ((lead & 0xF0) == 0xE0)
+    char_len = 3;
+  else if ((lead & 0xF8) == 0xF0)
+    char_len = 4;
+  else
+    char_len = 1;
+
+  /* Only continuation bytes (10xxxxxx) are skipped, and never more
+     bytes than are available, so a truncated or malformed sequence does
+     not swallow the start of the following character. */
+  skipped = 1;
+  while (skipped < char_len
+         && skipped < *from_left
+         && ((unsigned char) (*from)[skipped] & 0xC0) == 0x80)
+    skipped++;
+
+  *from += skipped;
+  *from_left -= skipped;
+
+  return 0;
+}
+
+/* Convert N bytes from input to output encoding and write to
+   output buffer.  Characters that cannot be converted are degraded to
+   an ASCII approximation.  If the N bytes end in an incomplete byte
+   sequence, further bytes are read to complete it, provided they lie
+   within the node contents.  output_bytes_difference is updated by the
+   number of input bytes consumed less the number of bytes written.
+   Return number of bytes over N that were read from the input. */
+static int
+copy_converting (size_t n)
+{
+#if !HAVE_ICONV
+  return 0;
+#else
+  /* Extra room asked for each time the output buffer turns out to be
+     too small. */
+  enum { OUTPUT_GROW_SLACK = 64 };
+
+  size_t bytes_left;
+  int extra_at_end;
+  size_t iconv_ret;
+  long output_start;
+  char *input_begin;
+
+  size_t utf8_char_free;
+  char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
+  char *utf8_char_ptr;
+  size_t utf8_char_len;
+  size_t i;
+
+  /* Use n as an estimate of how many bytes will be required
+     in target encoding. */
+  text_buffer_alloc (&output_buf, n);
+
+  output_start = text_buffer_off (&output_buf);
+  input_begin = inptr;
+  bytes_left = n;
+  extra_at_end = 0;
+
+  /* bytes_left is unsigned, so the loop is left explicitly once iconv
+     reports that all of the input has been converted. */
+  for (;;)
+    {
+      iconv_ret = text_buffer_iconv (&output_buf, iconv_to_output,
+                                     &inptr, &bytes_left);
+
+      if (iconv_ret != (size_t) -1)
+        /* Success: all of input converted. */
+        break;
+
+      /* There's been an error while converting. */
+      switch (errno)
+        {
+        case E2BIG:
+          /* Ran out of space in output buffer.  Allocate more
+             and try again.  The request is made relative to the space
+             currently free so that the buffer really does grow. */
+          text_buffer_alloc (&output_buf,
+                             text_buffer_space_left (&output_buf)
+                             + n + OUTPUT_GROW_SLACK);
+          continue;
+        case EILSEQ:
+          /* Byte sequence in input buffer not recognized.  Degrade the
+             offending character below. */
+          break;
+        case EINVAL:
+          /* Incomplete byte sequence at end of input buffer.  Try to read
+             more. */
+
+          /* Check whether there is at least one more byte within the
+             node contents after the bytes we were asked to convert. */
+          if ((size_t) (inptr - input_start) + bytes_left < input_length)
+            {
+              bytes_left++;
+              extra_at_end++;
+            }
+          else
+            {
+              /* Nothing more to read: copy the fragment unconverted. */
+              copy_direct (bytes_left);
+              bytes_left = 0;
+            }
+          /* Retry the conversion; the sequence is incomplete, not
+             invalid, so it must not be degraded. */
+          continue;
+        default: /* Unknown error - abort */
+          info_error (_("Error converting file character encoding."));
+
+          /* Skip past current input and hope we don't get an
+             error next time. */
+          inptr += bytes_left;
+          bytes_left = 0;
+          goto finish;
+        }
+
+      /* Degrade to ASCII. */
+
+      if (file_is_in_utf8)
+        {
+          degrade_utf8 (&inptr, &bytes_left);
+          continue;
+        }
+
+      /* If file is not in UTF-8, we degrade to ASCII in two steps:
+         first convert the character to UTF-8, then look up a replacement
+         string.  Note that mixing iconv_to_output and iconv_to_utf8
+         on the same input may not work well if the input encoding
+         is stateful. */
+
+      /* We want to read exactly one character.  Do this by
+         restricting size of output buffer.  iconv is expected to stop
+         with E2BIG once that one character has been written. */
+      utf8_char_ptr = utf8_char;
+      for (i = 1; i <= sizeof utf8_char; i++)
+        {
+          utf8_char_free = i;
+          iconv_ret = iconv (iconv_to_utf8, &inptr, &bytes_left,
+                             &utf8_char_ptr, &utf8_char_free);
+          /* If we managed to write a character: */
+          if (utf8_char_ptr > utf8_char)
+            break;
+        }
+
+      utf8_char_len = utf8_char_ptr - utf8_char;
+      if (utf8_char_len > 0)
+        {
+          /* Look up a replacement for the character just converted. */
+          utf8_char_ptr = utf8_char;
+          degrade_utf8 (&utf8_char_ptr, &utf8_char_len);
+        }
+      else if (bytes_left > 0)
+        {
+          /* Character is not recognized.  Copy a single byte, and
+             account for it so that we do not read past the input. */
+          copy_direct (1);
+          bytes_left--;
+        }
+    }
+
+ finish:
+  /* Count the input bytes actually consumed, which includes any bytes
+     read beyond N.  Must cast because the difference between unsigned
+     values is always positive. */
+  output_bytes_difference +=
+    (signed long) (inptr - input_begin)
+    - (signed long) (text_buffer_off (&output_buf) - output_start);
+
+  return extra_at_end;
+#endif /* HAVE_ICONV */
+}
+
+/* Copy text from input node contents, possibly converting the
+ character encoding and adjusting anchor offsets at the same time. */
static void
copy_input_to_output (size_t n)
{
if (rewrite_p)
{
- text_buffer_add_string (&output_buf, inptr, n);
- inptr += n;
+ size_t bytes_left;
- /* Check if we have gone past any anchors and
- adjust with output_bytes_difference. */
- if (anchor_to_adjust)
+ bytes_left = n;
+ while (bytes_left > 0)
{
- while ((*anchor_to_adjust)->nodestart - nodestart
- <= inptr - input_start)
+ if (!convert_encoding_p)
{
- (*anchor_to_adjust)->nodestart -= output_bytes_difference;
- anchor_to_adjust++;
- if (!*anchor_to_adjust || (*anchor_to_adjust)->nodelen != 0)
+ copy_direct (bytes_left);
+ bytes_left = 0;
+ }
+ else
+ {
+ size_t bytes_to_convert;
+ size_t extra_written;
+
+ if (anchor_to_adjust)
{
- anchor_to_adjust = 0;
- break;
+ char *first_anchor =
+ input_start + (*anchor_to_adjust)->nodestart;
+
+ /* If there is an anchor in the input: */
+ if (first_anchor <= inptr + bytes_left)
+ /* Convert enough to pass the first anchor in input. */
+ bytes_to_convert = first_anchor - inptr + 1;
+ else
+ bytes_to_convert = bytes_left;
}
+ else
+ bytes_to_convert = bytes_left;
+
+ /* copy_converting may read more than bytes_to_convert
+ bytes if its input ends in an incomplete byte sequence. */
+ extra_written = copy_converting (bytes_to_convert);
+
+ bytes_left -= bytes_to_convert + extra_written;
}
+
+ /* Check if we have gone past any anchors and
+ adjust with output_bytes_difference. */
+ if (anchor_to_adjust)
+ while ((*anchor_to_adjust)->nodestart - nodestart
+ <= inptr - input_start)
+ {
+ (*anchor_to_adjust)->nodestart -= output_bytes_difference;
+ anchor_to_adjust++;
+ if (!*anchor_to_adjust || (*anchor_to_adjust)->nodelen != 0)
+ {
+ anchor_to_adjust = 0;
+ break;
+ }
+ }
}
}
else
- {
- inptr += n;
- }
+ inptr += n;
}
static void
@@ -630,16 +951,16 @@ colon_after_newline (char *nodeptr)
return -1;
}
-/* Remove syntax from (*NODE)->contents and build list of references
+/* Remove syntax from FB->tags[TAG]->contents and build list of references
in node. Adjust anchors in tag table that point into node text.*/
void
-scan_node_contents (NODE **tag)
+scan_node_contents (FILE_BUFFER *fb, int tag)
{
SEARCH_BINDING s;
char *search_string;
int found_menu_entry, in_index = 0;
- NODE *node = *tag;
+ NODE *node = fb->tags[tag];
REFERENCE **refs = NULL;
size_t refs_index = 0, refs_slots = 0;
@@ -654,11 +975,10 @@ scan_node_contents (NODE **tag)
else
rewrite_p = 0;
- if (rewrite_p)
- init_output_stream ();
+ init_output_stream (fb);
/* Set anchor_to_adjust to first anchor in node, if any. */
- anchor_to_adjust = tag + 1;
+ anchor_to_adjust = &fb->tags[tag + 1];
if (!*anchor_to_adjust)
anchor_to_adjust = 0;
else if (*anchor_to_adjust && (*anchor_to_adjust)->nodelen != 0)
@@ -675,6 +995,7 @@ scan_node_contents (NODE **tag)
all other assignment should be done with the helper functions above. */
inptr = node->contents;
input_start = node->contents;
+ input_length = node->nodelen;
nodestart = node->nodestart;
parse_top_node_line (node);
@@ -995,7 +1316,7 @@ search_again:
if (!rewrite_p)
{
rewrite_p = 1;
- init_output_stream ();
+ init_output_stream (fb);
/* Put inptr back to start so that
copy_input_to_output below gets all
@@ -1045,6 +1366,9 @@ search_again:
if (rewrite_p)
text_buffer_add_string (&output_buf, "\0", 1);
+ /* Free resources used in character encoding conversion. */
+ close_conversion ();
+
node->references = refs;
if (rewrite_p)
@@ -1285,6 +1609,7 @@ text_buffer_vprintf (struct text_buffer *buf, const char *format, va_list ap)
return n;
}
+/* Make sure there are LEN free bytes at end of BUF. */
void
text_buffer_alloc (struct text_buffer *buf, size_t len)
{
@@ -1297,6 +1622,39 @@ text_buffer_alloc (struct text_buffer *buf, size_t len)
}
}
+/* Return how many more bytes BUF can take before it would have to be
+   reallocated. */
+size_t
+text_buffer_space_left (struct text_buffer *buf)
+{
+  /* Offset of the first byte after the allocated space. */
+  size_t allocated_end = buf->size;
+  /* Offset of the first byte to be written to. */
+  size_t write_offset = buf->off;
+
+  return allocated_end - write_offset;
+}
+
+#if HAVE_ICONV
+
+/* Run iconv with conversion descriptor ICONV_STATE, using the free
+   space at the end of text buffer BUF as the output buffer.  *INBUF and
+   *INBYTESLEFT are updated by iconv.  The write offset of BUF is moved
+   past whatever was written.  BUF is never enlarged here: when iconv
+   fails with E2BIG the caller has to allocate more space and call
+   again.  Return the value returned by iconv. */
+size_t
+text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
+                   char **inbuf, size_t *inbytesleft)
+{
+  size_t out_bytes_left;
+  char *outptr;
+  size_t iconv_ret;
+
+  outptr = text_buffer_base (buf) + text_buffer_off (buf);
+  out_bytes_left = text_buffer_space_left (buf);
+
+  /* Use the descriptor passed in by the caller rather than any
+     particular file-level one. */
+  iconv_ret = iconv (iconv_state, inbuf, inbytesleft,
+                     &outptr, &out_bytes_left);
+
+  text_buffer_off (buf) = outptr - text_buffer_base (buf);
+
+  return iconv_ret;
+}
+
+#endif /* HAVE_ICONV */
+
size_t
text_buffer_add_string (struct text_buffer *buf, const char *str, size_t len)
{
diff --git a/info-utils.h b/info-utils.h
index 60825bf..88c2e9f 100644
--- a/info-utils.h
+++ b/info-utils.h
@@ -26,6 +26,10 @@
#include "window.h"
#include "search.h"
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
+
/* When non-zero, various display and input functions handle ISO Latin
character sets correctly. */
extern int ISO_Latin_p;
@@ -56,10 +60,8 @@ extern char *info_parsed_nodename;
*/
int info_parse_node (char *string, int flag);
-/* NODE points to a tag table entry. Scan (*NODE)->contents for references
- and set (*NODE)->references. If preprocess_nodes_p=On, remove syntax
- from NODE->contents. Adjust anchors in tag table in this node. */
-void scan_node_contents (NODE **node);
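+/* Scan contents of FB->tags[TAG] for references and set the node's
+   'references' field.  If the node is being rewritten, remove syntax
+   from its contents.  Adjust anchors in the tag table that point into
+   the node text. */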
+void scan_node_contents (FILE_BUFFER *fb, int tag);
/* Get the entry associated with LABEL in REFERENCES. Return a pointer to
the reference if found, or NULL. */
@@ -120,6 +122,11 @@ void text_buffer_free (struct text_buffer *buf);
void text_buffer_alloc (struct text_buffer *buf, size_t len);
size_t text_buffer_vprintf (struct text_buffer *buf, const char *format,
va_list ap);
+size_t text_buffer_space_left (struct text_buffer *buf);
+#if HAVE_ICONV
+size_t text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
+ char **inbuf, size_t *inbytesleft);
+#endif
size_t text_buffer_add_string (struct text_buffer *buf, const char *str,
size_t len);
size_t text_buffer_fill (struct text_buffer *buf, int c, size_t len);
diff --git a/nodes.c b/nodes.c
index 8541c8f..ce3df00 100644
--- a/nodes.c
+++ b/nodes.c
@@ -42,6 +42,7 @@ static void get_tags_of_indirect_tags_table (FILE_BUFFER *file_buffer,
SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
static char *adjust_nodestart (NODE *node, int min, int max);
+static void get_file_character_encoding (FILE_BUFFER *fb);
static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -340,6 +341,53 @@ info_find_file_internal (char *filename, int get_tags)
return file_buffer;
}
+/* Look for a local variables section in the last 1000 bytes of FB and
+   set FB->encoding to a newly allocated copy of the value following
+   CHARACTER_ENCODING_LABEL there.  FB->encoding is set to null if there
+   is no such section, no such label, or the value is empty. */
+static void
+get_file_character_encoding (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+
+  long int enc_start, enc_end;
+  char *enc_string;
+
+  /* See if there is a local variables section in this info file. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  /* Null means the encoding is unknown. */
+  fb->encoding = 0;
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward (CHARACTER_ENCODING_LABEL, &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += strlen (CHARACTER_ENCODING_LABEL); /* Skip to after "coding:" */
+  enc_start += skip_whitespace (fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* The name runs to the end of the line.  If the file ends without a
+     newline, let it run to the end of the file instead of using an
+     unset offset. */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+
+  /* Drop trailing blanks and a carriage return left by CRLF line
+     endings; they are not part of the encoding name. */
+  while (enc_end > enc_start
+         && (fb->contents[enc_end - 1] == ' '
+             || fb->contents[enc_end - 1] == '\t'
+             || fb->contents[enc_end - 1] == '\r'))
+    enc_end--;
+
+  /* An empty name tells us nothing: leave the encoding unknown. */
+  if (enc_end <= enc_start)
+    return;
+
+  enc_string = xmalloc (enc_end - enc_start + 1);
+  memcpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
+  enc_string[enc_end - enc_start] = '\0';
+
+  fb->encoding = enc_string;
+}
+
/* The workhorse function for info_load_file (). Non-zero second argument
says to build a list of tags (or nodes) for this file. This is the
default behaviour when info_load_file () is called, but it is not
@@ -412,6 +460,9 @@ info_load_file_internal (char *filename, int get_tags)
if (compressed)
file_buffer->flags |= N_IsCompressed;
+ /* Find encoding of file, if set */
+ get_file_character_encoding (file_buffer);
+
/* If requested, build the tags and nodes for this file buffer. */
if (get_tags)
build_tags_and_nodes (file_buffer);
@@ -1058,7 +1109,7 @@ info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer, char *nodename)
/* Read locations of references in node and similar. Rewrite
node from tag->contents if preprocess_nodes=On. */
- scan_node_contents (&file_buffer->tags[i]);
+ scan_node_contents (file_buffer, i);
*node = *tag;
}
diff --git a/nodes.h b/nodes.h
index 5f0a23d..ac0d432 100644
--- a/nodes.h
+++ b/nodes.h
@@ -98,6 +98,9 @@ typedef struct {
#define TAGS_TABLE_BEG_LABEL "Tag Table:\n"
#define INDIRECT_TAGS_TABLE_LABEL "Indirect:\n"
#define TAGS_TABLE_IS_INDIRECT_LABEL "(Indirect)"
+#define LOCAL_VARIABLES_LABEL "Local Variables"
+#define CHARACTER_ENCODING_LABEL "coding:"
+
/* Character constants. */
#define INFO_COOKIE '\037'
@@ -121,6 +124,7 @@ typedef struct {
NODE **tags; /* If non-null, the indirect tags table. */
size_t tags_slots; /* Number of slots allocated for TAGS. */
int flags; /* Various flags. Mimics of N_* flags. */
+ char *encoding; /* Name of character encoding of file. */
} FILE_BUFFER;
/* Externally visible functions. */