diff -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.c info-locale-5405/trunk/info/nodes.c
--- texinfo/trunk/info/nodes.c 2014-01-07 20:11:42.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.c 2014-01-31 21:23:52.000000000 +0000
@@ -27,6 +27,9 @@
 
 #include "info-utils.h"
 #include "tag.h"
+#include <iconv.h>
+#include <langinfo.h>
+#include <errno.h>
 
 #if defined (HANDLE_MAN_PAGES)
 # include "man.h"
@@ -42,6 +45,8 @@
     SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
 static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
 static char *adjust_nodestart (NODE *node, int min, int max);
+static void set_file_lc_ctype (FILE_BUFFER *fb);
+static void convert_characters (FILE_BUFFER *fb);
 static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
 static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
 static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -326,6 +331,307 @@
   return file_buffer;
 }
 
+char *encoding_names[] = { "US-ASCII", "UTF-8", "ISO-8859-1", "ISO-8859-2",
+    "ISO-8859-15", "koi8-r", "koi8-u", 0 };
+
+/* Look for a local variables section near the end of FB and record the
+   index into encoding_names of any "coding:" it names in FB->lc_ctype.
+   FB->lc_ctype is left as ENC_UNKNOWN if there is no such section or the
+   named encoding is not listed in encoding_names. */
+static void
+set_file_lc_ctype (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+
+  long int enc_start, enc_end;
+  char *enc_string;
+
+  char **encoding_name;
+
+  /* See if there is a local variables section in this info file. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  fb->lc_ctype = ENC_UNKNOWN;
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward ("coding:", &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += 7; /* Skip to after "coding:" */
+  enc_start += skip_whitespace(fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* The encoding name runs up to the next newline.  If there is none,
+     take everything up to the end of the file rather than reading an
+     unset ENC_END. */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+  if (enc_end < enc_start)
+    return;
+
+  enc_string = xmalloc (enc_end - enc_start + 1);
+  memcpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
+  enc_string[enc_end - enc_start] = '\0';
+
+  for (encoding_name = encoding_names; *encoding_name != 0; encoding_name++)
+    if (!strcasecmp(enc_string, *encoding_name))
+      {
+        fb->lc_ctype = encoding_name - encoding_names;
+        break;
+      }
+
+  /* The name was only needed for the lookup. */
+  free (enc_string);
+}
+
+/* The degrade functions read one character at *FROM and write out at
+   *TO a sequence of bytes representing that character in ASCII.  *FROM
+   and *TO are both advanced past the read/written bytes, and *FROM_LEFT
+   and *TO_LEFT are reduced by the same amounts.
+   Calling code assumes that replacement strings are no more than
+   4 characters. */
+
+struct encoding_replacement
+{
+  char *from_string;
+  char *to_string;
+};
+
+/* Copy one byte from *FROM to *TO unchanged. */
+static void
+degrade_dummy (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  /* FIXME: Check if **to is in range 0x00 to 0x7F? */
+  **to = **from;
+  (*from)++; (*to)++;
+  (*from_left)--; (*to_left)--;
+}
+
+/* Replace a UTF-8 sequence listed in ER with its ASCII stand-in; any
+   other byte is copied across unchanged. */
+static void
+degrade_utf8 (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  struct encoding_replacement er[] = {
+    {"\xe2\x80\x98","'"}, /* Opening quote */
+    {"\xe2\x80\x99","'"}, /* Closing quote */
+    {0, 0}};
+
+  struct encoding_replacement *erp;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      size_t from_len = strlen (erp->from_string);
+      size_t to_len = strlen (erp->to_string);
+
+      /* Never compare past the end of the remaining input. */
+      if (from_len <= *from_left
+          && !memcmp (*from, erp->from_string, from_len))
+        {
+          memcpy (*to, erp->to_string, to_len);
+          *from += from_len;
+          *from_left -= from_len;
+          *to += to_len;
+          *to_left -= to_len;
+          return;
+        }
+    }
+
+  /* Failing this, just copy a byte across */
+  /* FIXME: Use SUB instead (^Z)? */
+  **to = **from;
+  (*from)++; (*to)++;
+  (*from_left)--; (*to_left)--;
+}
+
+/* Enlarge the output buffer *CONTENTS, whose usable size is *ALLOCATED,
+   always keeping one extra byte for a terminating null.  *OUTBUF and
+   *OUT_BYTES_LEFT are updated to refer to the same write position in
+   the reallocated buffer. */
+static void
+grow_output_buffer (char **contents, size_t *allocated,
+                    char **outbuf, size_t *out_bytes_left)
+{
+  size_t out_offset = *outbuf - *contents;
+
+  /* Add a few bytes on top of doubling so that a very small buffer
+     still gains room for a replacement string. */
+  *allocated = *allocated * 2 + 8;
+  *contents = xrealloc (*contents, *allocated + 1);
+
+  *outbuf = *contents + out_offset;
+  *out_bytes_left = *allocated - out_offset;
+}
+
+/* Convert characters in the nodes for FB to the current locale.  iconv
+   is used when it offers a conversion from the file's encoding; input
+   it rejects is passed through the degrade function for that encoding
+   instead.  On completion FB->contents is replaced by the converted,
+   null-terminated text and FB->filesize is updated.  FB is left
+   untouched if its encoding is unknown or already matches the locale,
+   if no conversion is implemented, or if iconv fails unexpectedly. */
+static void
+convert_characters (FILE_BUFFER *fb)
+{
+  long node = 0, nextnode;
+  SEARCH_BINDING binding;
+  char *to_locale;
+
+  iconv_t iconv_state;
+  int iconv_available = 0;
+
+  /* One entry for each name in encoding_names, in the same order. */
+  void (*degrade_funcs[])(char **, size_t *,
+                          char **, size_t *) = {
+    degrade_dummy, degrade_utf8, degrade_dummy,
+    degrade_dummy, degrade_dummy, degrade_dummy,
+    degrade_dummy };
+
+  /* Function to use to convert file locale to ASCII */
+  void (*degrade)(char **, size_t *, char **, size_t *);
+
+  if (fb->lc_ctype == ENC_UNKNOWN) return;
+
+  /* Never index past the end of degrade_funcs. */
+  if (fb->lc_ctype < 0
+      || (size_t) fb->lc_ctype
+         >= sizeof degrade_funcs / sizeof degrade_funcs[0])
+    return;
+
+  /* Read environment locale */
+  to_locale = nl_langinfo(CODESET);
+
+  /* Don't degrade the contents if we are in fact
+   * in the right locale for the file */
+  if (!strcasecmp(to_locale, encoding_names[fb->lc_ctype]))
+    return;
+
+  degrade = degrade_funcs [fb->lc_ctype];
+
+  /* Check if an iconv conversion from file locale to system
+   * locale exists - if so we will try to use it. */
+  iconv_state = iconv_open (to_locale, encoding_names[fb->lc_ctype]);
+  if (iconv_state != (iconv_t) -1)
+    iconv_available = 1;
+
+  /* Return if no conversion function implemented */
+  if (!iconv_available && degrade == degrade_dummy) return;
+
+  /* Allocate space for the converted file buffer (including
+     terminating null). */
+  char *new_contents = xcalloc (1, fb->filesize + 1);
+  size_t new_contents_allocated = fb->filesize;
+  char *outbuf = new_contents;
+  size_t out_bytes_left = fb->filesize;
+
+  binding.buffer = fb->contents;
+  binding.start = 0;
+  binding.end = fb->filesize;
+  binding.flags = 0;
+
+  /* Convert sections of the file separated by node separators.  These
+   * will be preambles, nodes, tag tables, or local variable sections.
+   * We convert all of them, although probably only the nodes need to
+   * be converted.
+   * The second part of the condition makes us operate on the last
+   * section, which does not end with a node separator. */
+  while ((nextnode = find_node_separator (&binding)) != -1
+         || (node != fb->filesize && (nextnode = fb->filesize)))
+    {
+      char *inbuf;
+      size_t inbytesleft;
+
+      /* Update search for next iteration */
+      binding.start = nextnode + 1;
+
+      /* Convert characters from node to nextnode */
+      inbuf = binding.buffer + node;
+      inbytesleft = nextnode - node;
+
+      while (inbuf < binding.buffer + nextnode)
+        {
+          if (iconv_available)
+            {
+              while (1)
+                {
+                  size_t iconv_ret;
+
+                  iconv_ret = iconv (iconv_state, &inbuf, &inbytesleft,
+                                     &outbuf, &out_bytes_left);
+
+                  /* Success: this section is fully converted. */
+                  if (iconv_ret != (size_t) -1)
+                    goto continue_node_loop;
+
+                  /* There's been an error while converting. */
+                  switch (errno)
+                    {
+                    case E2BIG:
+                      /* Ran out of space in output buffer.  Reallocate
+                       * and try again. */
+                      grow_output_buffer (&new_contents,
+                                          &new_contents_allocated,
+                                          &outbuf, &out_bytes_left);
+                      continue;
+                    case EILSEQ:
+                      /* Byte sequence in input buffer not recognized.
+                       * Degrade to ASCII instead.
+                       * (FIXME: Check that output encoding
+                       * is backwards compatible with ASCII). */
+                      goto degrade_to_ascii;
+                    case EINVAL:
+                      /* Incomplete byte sequence at end of input
+                       * buffer */
+                      goto degrade_to_ascii;
+                    default:
+                      /* Unknown error - abort, leaving FB as it was
+                       * and releasing what was acquired here. */
+                      iconv_close (iconv_state);
+                      free (new_contents);
+                      return;
+                    }
+                }
+            }
+        degrade_to_ascii:
+          /* Make sure that there is enough space to write
+           * replacement string.  4 bytes should be enough for one
+           * character */
+          if (out_bytes_left <= 4)
+            grow_output_buffer (&new_contents, &new_contents_allocated,
+                                &outbuf, &out_bytes_left);
+
+          degrade(&inbuf, &inbytesleft, &outbuf, &out_bytes_left);
+        }
+    continue_node_loop:
+      node = nextnode;
+      node += skip_whitespace (binding.buffer + node);
+    }
+
+  if (iconv_available)
+    iconv_close (iconv_state);
+
+  /* Terminate the converted text; the buffer always holds one byte
+     more than its usable size for this. */
+  *outbuf = '\0';
+
+  free(fb->contents);
+  fb->contents = new_contents;
+  fb->filesize = outbuf - new_contents;
+}
+
 /* The workhorse function for info_load_file ().  Non-zero second argument
    says to build a list of tags (or nodes) for this file.  This is the
    default behaviour when info_load_file () is called, but it is not
@@ -397,7 +703,14 @@
   file_buffer->contents = contents;
   if (compressed)
     file_buffer->flags |= N_IsCompressed;
+
+  /* Find encoding of file, if set */
+  set_file_lc_ctype(file_buffer);
 
+  /* Convert characters in file buffer to current locale as much
+   * as possible. */
+  convert_characters (file_buffer);
+
   /* If requested, build the tags and nodes for this file buffer. */
   if (get_tags)
     build_tags_and_nodes (file_buffer);
diff -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.h info-locale-5405/trunk/info/nodes.h
--- texinfo/trunk/info/nodes.h 2013-12-28 17:11:03.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.h 2014-01-31 21:28:26.000000000 +0000
@@ -72,6 +72,7 @@
 #define TAGS_TABLE_BEG_LABEL "Tag Table:\n"
 #define INDIRECT_TAGS_TABLE_LABEL "Indirect:\n"
 #define TAGS_TABLE_IS_INDIRECT_LABEL "(Indirect)"
+#define LOCAL_VARIABLES_LABEL "Local Variables"
 
 /* Character constants. */
 #define INFO_COOKIE '\037'
@@ -112,7 +113,16 @@
   TAG **tags; /* If non-null, the indirect tags table. */
   size_t tags_slots; /* Number of slots allocated for TAGS. */
   int flags; /* Various flags. Mimics of N_* flags. */
+  int lc_ctype; /* Encoding - index into encoding_names */
 } FILE_BUFFER;
+
+/* Null-terminated array of strings naming character encodings that Info
+   files could be encoded in. */
+extern char *encoding_names[];
+
+/* Value of FILE_BUFFER.lc_ctype if encoding is unknown */
+#define ENC_UNKNOWN (-1)
+
 
 /* Externally visible functions. */
 