diff -u -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.c info-locale-5405/trunk/info/nodes.c
--- texinfo/trunk/info/nodes.c 2014-01-07 20:11:42.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.c 2014-02-02 17:16:53.000000000 +0000
@@ -27,6 +27,11 @@
 #include "info-utils.h"
 #include "tag.h"
 
+#include <errno.h>
+#include <langinfo.h>
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
 
 #if defined (HANDLE_MAN_PAGES)
 # include "man.h"
@@ -42,6 +47,8 @@
     SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
 static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
 static char *adjust_nodestart (NODE *node, int min, int max);
+static void get_file_character_encoding (FILE_BUFFER *fb);
+static void convert_characters (FILE_BUFFER *fb);
 static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
 static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
 static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -326,6 +333,328 @@
   return file_buffer;
 }
 
+/* Look for a local variables section near the end of FB and record the
+   character encoding it names in FB->encoding.  FB->encoding is set to
+   the string literal "UTF-8" when no encoding is found, and to a string
+   allocated with xmalloc otherwise.  */
+static void
+get_file_character_encoding (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+
+  long int enc_start, enc_end;
+  size_t enc_len;
+  char *enc_string;
+
+  /* See if there is a local variables section in this info file. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  /* Assume file is in UTF-8 by default.  */
+  fb->encoding = "UTF-8";
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward (CHARACTER_ENCODING_LABEL, &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += strlen (CHARACTER_ENCODING_LABEL); /* Skip to after "coding:" */
+  enc_start += skip_whitespace (fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* The name runs up to the next newline.  If there is none, use the
+     end of the file instead of relying on ENC_END, which the failed
+     search may not have set.  */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+
+  /* Keep the UTF-8 default if no name follows the label.  */
+  if (enc_end <= enc_start)
+    return;
+
+  enc_len = enc_end - enc_start;
+  enc_string = xmalloc (enc_len + 1);
+  memcpy (enc_string, fb->contents + enc_start, enc_len);
+  enc_string[enc_len] = '\0';
+
+  fb->encoding = enc_string;
+}
+
+struct encoding_replacement
+{
+  char *from_string;
+  char *to_string;
+};
+
+/* Read one character at *FROM and write out at *TO a sequence
+   of bytes representing that character in ASCII.  *FROM and *TO are
+   advanced past the read/written bytes, and *FROM_LEFT and *TO_LEFT are
+   reduced by the same amounts.  The caller must make sure that
+   *FROM_LEFT is at least 1 and that *TO_LEFT is at least 4; replacement
+   strings are no more than 4 characters.  */
+static void
+degrade_utf8 (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  static const struct encoding_replacement er[] = {
+    {"\xE2\x80\x98","'"}, /* Opening single quote */
+    {"\xE2\x80\x99","'"}, /* Closing single quote */
+    {"\xE2\x80\x9C","\""},/* Opening double quote */
+    {"\xE2\x80\x9D","\""},/* Closing double quote */
+    {"\xC2\xA9","(C)"}, /* Copyright symbol */
+    {"\xC2\xBB",">>"}, /* Closing double angle brackets */
+    {"\xE2\x86\x92","->"},/* Right arrow */
+    {0, 0}};
+
+  const struct encoding_replacement *erp;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      size_t from_len = strlen (erp->from_string);
+      size_t to_len = strlen (erp->to_string);
+
+      /* Never look beyond the bytes the caller says are available.  */
+      if (from_len <= *from_left
+          && !memcmp (*from, erp->from_string, from_len))
+        {
+          memcpy (*to, erp->to_string, to_len);
+          *from += from_len;
+          *from_left -= from_len;
+          *to += to_len;
+          *to_left -= to_len;
+          return;
+        }
+    }
+
+  /* Failing this, just copy a byte across */
+  /* FIXME: Use SUB instead (^Z)?  */
+  **to = **from;
+  (*to)++; (*from)++;
+  (*to_left)--; (*from_left)--;
+}
+
+#if HAVE_ICONV
+/* Double the size of the output buffer *CONTENTS, whose usable size is
+   *ALLOCATED, and update the write pointer *OUTPTR and the free space
+   count *OUT_BYTES_LEFT to match.  One byte beyond *ALLOCATED is always
+   kept for a terminating null.  */
+static void
+grow_output_buffer (char **contents, size_t *allocated,
+                    char **outptr, size_t *out_bytes_left)
+{
+  size_t out_offset = *outptr - *contents;
+
+  *allocated = *allocated ? *allocated * 2 : 16;
+  *contents = xrealloc (*contents, *allocated + 1);
+
+  *outptr = *contents + out_offset;
+  *out_bytes_left = *allocated - out_offset;
+}
+#endif /* HAVE_ICONV */
+
+/* Convert characters in the nodes for FB to the current locale.  On
+   success FB->contents is replaced by a newly allocated, null-terminated
+   buffer and FB->filesize is updated.  FB is left untouched if no
+   conversion is needed, if iconv has no suitable conversion, or if
+   iconv reports an unexpected error.  */
+static void
+convert_characters (FILE_BUFFER *fb)
+{
+#if !HAVE_ICONV
+  return;
+#else
+  long node = 0, nextnode;
+  SEARCH_BINDING binding;
+  char *target_encoding;
+
+  char *new_contents = NULL, *outptr;
+  size_t new_contents_allocated;
+  size_t out_bytes_left;
+
+  /* Used for conversion from file encoding to output encoding */
+  iconv_t iconv_state;
+
+  /* Used to find ASCII replacements when the file is not in UTF-8 */
+  iconv_t iconv_to_utf8 = (iconv_t) -1;
+  char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
+
+  /* Whether file buffer is encoded in UTF-8 */
+  int file_is_in_utf8 = 0;
+
+  /* Read name of character encoding from environment locale */
+  target_encoding = nl_langinfo (CODESET);
+
+  /* Don't convert the contents if the locale
+     uses the same character encoding as the file */
+  if (!strcasecmp (target_encoding, fb->encoding))
+    return;
+
+  /* Check if an iconv conversion from file locale to system
+     locale exists */
+  iconv_state = iconv_open (target_encoding, fb->encoding);
+  if (iconv_state == (iconv_t) -1)
+    return; /* Return if no conversion function implemented */
+
+  if (   !strcasecmp ("UTF8", fb->encoding)
+      || !strcasecmp ("UTF-8", fb->encoding))
+    file_is_in_utf8 = 1;
+
+  if (!file_is_in_utf8)
+    {
+      iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
+      if (iconv_to_utf8 == (iconv_t) -1)
+        goto cleanup; /* No conversion function implemented */
+    }
+
+  /* Allocate space for the converted file buffer.  One byte beyond
+     NEW_CONTENTS_ALLOCATED is kept for the terminating null.  */
+  new_contents_allocated = fb->filesize;
+  new_contents = xcalloc (1, new_contents_allocated + 1);
+  outptr = new_contents;
+  out_bytes_left = new_contents_allocated;
+
+  binding.buffer = fb->contents;
+  binding.start = 0;
+  binding.end = fb->filesize;
+  binding.flags = 0;
+
+  /* Convert sections of the file separated by node separators.  These
+     will be preambles, nodes, tag tables, or local variable sections.
+     We convert all of them, although probably only the nodes need to
+     be converted.
+     The second part of the condition makes us operate on the last
+     section, which does not end with a node separator.  */
+  while ((nextnode = find_node_separator (&binding)) != -1
+         || (node != fb->filesize && (nextnode = fb->filesize)))
+    {
+      char *inptr;
+      size_t in_bytes_left;
+
+      /* Update search for next iteration */
+      binding.start = nextnode + 1;
+
+      /* Convert characters from node to nextnode */
+      inptr = binding.buffer + node;
+      in_bytes_left = nextnode - node;
+
+      while (inptr < binding.buffer + nextnode)
+        {
+          size_t iconv_ret;
+          int iconv_errno;
+
+          /* Attempt to convert node contents using iconv */
+          iconv_ret = iconv (iconv_state, &inptr, &in_bytes_left,
+                             &outptr, &out_bytes_left);
+          if (iconv_ret != (size_t) -1)
+            break; /* Success: the rest of the section is converted */
+
+          /* There's been an error while converting.  Save errno before
+             any other library call can change it.  */
+          iconv_errno = errno;
+
+          if (iconv_errno == E2BIG)
+            {
+              /* Ran out of space in output buffer.  Reallocate and
+                 try again.  */
+              grow_output_buffer (&new_contents, &new_contents_allocated,
+                                  &outptr, &out_bytes_left);
+              continue;
+            }
+
+          if (iconv_errno != EILSEQ && iconv_errno != EINVAL)
+            goto cleanup; /* Unknown error - abort, leaving FB as it was */
+
+          /* EILSEQ: byte sequence in input buffer not recognized.
+             EINVAL: incomplete byte sequence at end of input buffer.
+             Degrade to ASCII instead.
+             (FIXME: Check that output encoding
+             is backwards compatible with ASCII).  */
+
+          /* Make sure that there is enough space to write
+             replacement string.  4 bytes should be enough for one
+             character */
+          while (out_bytes_left <= 4)
+            grow_output_buffer (&new_contents, &new_contents_allocated,
+                                &outptr, &out_bytes_left);
+
+          if (file_is_in_utf8)
+            {
+              degrade_utf8 (&inptr, &in_bytes_left, &outptr, &out_bytes_left);
+            }
+          else
+            {
+              /* When a character in file cannot be represented in the output
+                 encoding, convert the character to UTF-8, then call
+                 degrade_utf8() to get an ASCII replacement.  */
+
+              char *char_start = inptr;
+              char *utf8_char_ptr = utf8_char;
+              size_t utf8_char_free, utf8_char_len, i;
+
+              /* We want to read exactly one character.  Do this by
+                 restricting size of output buffer.  */
+              for (i = 1; i <= sizeof utf8_char; i++)
+                {
+                  utf8_char_free = i;
+                  iconv_ret = iconv (iconv_to_utf8, &inptr, &in_bytes_left,
+                                     &utf8_char_ptr, &utf8_char_free);
+                  iconv_errno = errno;
+
+                  /* Stop once a character has been written, or on any
+                     error other than E2BIG, which is expected while the
+                     output buffer is still too small.  */
+                  if (utf8_char_ptr > utf8_char
+                      || (iconv_ret == (size_t) -1 && iconv_errno != E2BIG))
+                    break;
+                }
+
+              utf8_char_len = utf8_char_ptr - utf8_char;
+              if (utf8_char_len > 0)
+                {
+                  utf8_char_ptr = utf8_char;
+                  degrade_utf8 (&utf8_char_ptr, &utf8_char_len,
+                                &outptr, &out_bytes_left);
+                }
+              else if (inptr == char_start)
+                {
+                  /* Character is not recognized.  Copy a single byte.  */
+                  *outptr = *inptr;
+                  outptr++; inptr++;
+                  out_bytes_left--; in_bytes_left--;
+                }
+            }
+        }
+
+      node = nextnode;
+      node += skip_whitespace (binding.buffer + node);
+    }
+
+  /* xrealloc does not clear the memory it returns, so write the
+     terminating null explicitly before handing the buffer to FB.  */
+  *outptr = '\0';
+  free (fb->contents);
+  fb->contents = new_contents;
+  fb->filesize = outptr - new_contents;
+  new_contents = NULL; /* Now owned by FB */
+
+cleanup:
+  free (new_contents);
+  iconv_close (iconv_state);
+  if (iconv_to_utf8 != (iconv_t) -1)
+    iconv_close (iconv_to_utf8);
+#endif /* HAVE_ICONV */
+}
+
 /* The workhorse function for info_load_file ().  Non-zero second argument
    says to build a list of tags (or nodes) for this file.  This is the default
    behaviour when info_load_file () is called, but it is not
@@ -397,7 +726,14 @@
   file_buffer->contents = contents;
   if (compressed)
     file_buffer->flags |= N_IsCompressed;
+
+  /* Find encoding of file, if set */
+  get_file_character_encoding (file_buffer);
 
+  /* Convert characters in file buffer to current locale as much
+   * as possible. */
+  convert_characters (file_buffer);
+
   /* If requested, build the tags and nodes for this file buffer. */
   if (get_tags)
     build_tags_and_nodes (file_buffer);
diff -u -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.h info-locale-5405/trunk/info/nodes.h
--- texinfo/trunk/info/nodes.h 2013-12-28 17:11:03.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.h 2014-02-01 17:25:21.000000000 +0000
@@ -72,6 +72,8 @@
 #define TAGS_TABLE_BEG_LABEL "Tag Table:\n"
 #define INDIRECT_TAGS_TABLE_LABEL "Indirect:\n"
 #define TAGS_TABLE_IS_INDIRECT_LABEL "(Indirect)"
+#define LOCAL_VARIABLES_LABEL "Local Variables"
+#define CHARACTER_ENCODING_LABEL "coding:"
 
 /* Character constants. */
 #define INFO_COOKIE '\037'
@@ -112,7 +114,9 @@
   TAG **tags; /* If non-null, the indirect tags table. */
   size_t tags_slots; /* Number of slots allocated for TAGS. */
   int flags; /* Various flags. Mimics of N_* flags. */
+  char *encoding; /* Character encoding of file */
 } FILE_BUFFER;
+
 
 /* Externally visible functions. */
 