commit 7baf4f407ff3979824a942fbd2226ae57962fd41 Author: Gavin Smith Date: Thu Mar 20 22:52:26 2014 +0000 Character encoding conversion This change requires the iconv module to be added from gnulib. info-utils.c (input_length, convert_encoding_p, file_is_in_utf8) (iconv_to_output, iconv_to_utf8): New file-level variables. (init_conversion): New function. (init_output_stream): Arguments changed. Call init_conversion. (copy_direct, degrade_utf8, copy_converting): New functions. (copy_input_to_output): Conditionally convert character encoding of copied text. (scan_node_contents): Arguments changed. (text_buffer_space_left, text_buffer_iconv): New functions. nodes.c (get_file_character_encoding): New function. (info_load_file_internal): Call get_file_character_encoding. nodes.h (LOCAL_VARIABLES_LABEL, CHARACTER_ENCODING_LABEL): New preprocessor symbols. (FILE_BUFFER): New field 'encoding'. diff --git a/info-utils.c b/info-utils.c index 6c1f7e6..0b6fee0 100644 --- a/info-utils.c +++ b/info-utils.c @@ -23,6 +23,12 @@ #include "info-utils.h" #include "tag.h" +#include +#include +#if HAVE_ICONV +# include +#endif + #if defined (HANDLE_MAN_PAGES) # include "man.h" #endif /* HANDLE_MAN_PAGES */ @@ -452,18 +458,100 @@ static int rewrite_p; static char *input_start, *inptr; +/* Number of bytes in node contents. */ +static size_t input_length; + struct text_buffer output_buf; static NODE **anchor_to_adjust; static int nodestart; +/* Whether we are converting the character encoding of the file. */ +static int convert_encoding_p; + +#if HAVE_ICONV + +/* Whether text in file is encoded in UTF-8. */ +static int file_is_in_utf8; + +/* Used for conversion from file encoding to output encoding. */ +static iconv_t iconv_to_output; + +/* Conversion from file encoding to UTF-8. 
*/ +static iconv_t iconv_to_utf8; + +#endif /* HAVE_ICONV */ + +void +init_conversion (FILE_BUFFER *fb) +{ + char *target_encoding; + + convert_encoding_p = 0; + +#if !HAVE_ICONV + return; +#else + file_is_in_utf8 = 0; + + /* Don't process file if encoding is unknown. */ + if (!fb->encoding) + return; + + /* Read name of character encoding from environment locale */ + target_encoding = nl_langinfo (CODESET); + + /* Don't convert the contents if the locale + uses the same character encoding as the file */ + if (!strcasecmp(target_encoding, fb->encoding)) + return; + + /* Check if an iconv conversion from file locale to system + locale exists */ + iconv_to_output = iconv_open (target_encoding, fb->encoding); + if (iconv_to_output == (iconv_t) -1) + return; /* Return if no conversion function implemented */ + + if ( !strcasecmp ("UTF8", fb->encoding) + || !strcasecmp ("UTF-8", fb->encoding)) + file_is_in_utf8 = 1; + + if (!file_is_in_utf8) + { + iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding); + if (iconv_to_utf8 == (iconv_t) -1) + { + /* Return if no conversion function implemented */ + iconv_close (iconv_to_output); + return; + } + } + + convert_encoding_p = 1; + rewrite_p = 1; +#endif /* HAVE_ICONV */ +} + +void close_conversion (void) +{ +#if HAVE_ICONV + if (convert_encoding_p) + { + iconv_close (iconv_to_output); + if (!file_is_in_utf8) iconv_close (iconv_to_utf8); + } +#endif +} + /* Difference between the number of bytes input in the file and bytes output. */ static long int output_bytes_difference; static void -init_output_stream (void) +init_output_stream (FILE_BUFFER *fb) { + init_conversion (fb); + if (rewrite_p) { text_buffer_init (&output_buf); @@ -471,35 +559,268 @@ init_output_stream (void) } } +/* Copy bytes from input to output with no encoding conversion. 
*/
+static void
+copy_direct (size_t n)
+{
+  text_buffer_add_string (&output_buf, inptr, n);
+  inptr += n;
+}
+
+/* Read one character at *FROM and write an ASCII replacement for it to
+   output_buf.  Advance *FROM and reduce *FROM_LEFT by the bytes consumed.
+   Return 1 if a replacement was found, 0 if "?" was written instead. */
+static int
+degrade_utf8 (char **from, size_t *from_left)
+{
+  static struct encoding_replacement
+  {
+    char *from_string;
+    char *to_string;
+  } er[] = {
+    {"\xE2\x80\x98","'"},   /* Opening single quote */
+    {"\xE2\x80\x99","'"},   /* Closing single quote */
+    {"\xE2\x80\x9C","\""},  /* Opening double quote */
+    {"\xE2\x80\x9D","\""},  /* Closing double quote */
+    {"\xC2\xA9","(C)"},     /* Copyright symbol */
+    {"\xC2\xBB",">>"},      /* Closing double angle brackets */
+    {"\xE2\x86\x92","->"},  /* Right arrow */
+
+    {"\xC3\xA0","a`"},      /* Lower case letter a with grave accent */
+    {"\xC3\xA2","a^"},      /* Lower case letter a with circumflex */
+    {"\xC3\xA4","a\""},     /* Lower case letter a with diaeresis */
+    {"\xC3\xA6","ae"},      /* Lower case letter ae ligature */
+    {"\xC3\xA9","e'"},      /* Lower case letter e with acute accent */
+    {"\xC3\xA8","e`"},      /* Lower case letter e with grave accent */
+    {"\xC3\xAA","e^"},      /* Lower case letter e with circumflex */
+    {"\xC3\xAB","e\""},     /* Lower case letter e with diaeresis */
+    {"\xC3\xB6","o\""},     /* Lower case letter o with diaeresis */
+    {"\xC3\xBC","u\""},     /* Lower case letter u with diaeresis */
+    {"\xC3\xB1","n~"},      /* Lower case letter n with tilde */
+    {"\xC3\x87","C,"},      /* Upper case letter C with cedilla */
+    {"\xC3\xA7","c,"},      /* Lower case letter c with cedilla */
+    {"\xC3\x9f","ss"},      /* Lower case letter sharp s */
+
+    {0, 0}
+  };
+
+  struct encoding_replacement *erp;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      /* Avoid reading past end of input. */
+      size_t width = strlen (erp->from_string);
+      if (width > *from_left)
+        continue;
+
+      if (!strncmp (erp->from_string, *from, width))
+        {
+          text_buffer_add_string (&output_buf, erp->to_string,
+                                  strlen(erp->to_string));
+          *from += width;
+          *from_left -= width;
+          return 1;
+        }
+    }
+
+  /* Failing this, just print a question mark.  Maybe we should use SUB
+     (^Z) (ASCII substitute character code) instead. */
+  text_buffer_add_string (&output_buf, "?", 1);
+
+  /* Ideally we would advance one UTF-8 character.  This would
+     require knowing its length in bytes. */
+  (*from)++;
+  (*from_left)--;
+
+  return 0;
+}
+
+/* Convert N bytes from input to output encoding and write to output
+   buffer.  Return number of input bytes read beyond the first N. */
+static int
+copy_converting (size_t n)
+{
+#if !HAVE_ICONV
+  return 0;
+#else
+  size_t bytes_left;
+  int extra_at_end;
+  size_t iconv_ret;
+  long output_start;
+
+  size_t utf8_char_free;
+  char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
+  char *utf8_char_ptr;
+  size_t i; /* size_t because its address is passed to degrade_utf8 */
+
+  /* Use n as an estimate of how many bytes will be required
+     in target encoding. */
+  text_buffer_alloc (&output_buf, n);
+
+  output_start = text_buffer_off (&output_buf);
+  bytes_left = n;
+  extra_at_end = 0;
+  while (bytes_left > 0)
+    {
+      iconv_ret = text_buffer_iconv (&output_buf, iconv_to_output,
+                                     &inptr, &bytes_left);
+
+      if (iconv_ret != (size_t) -1)
+        /* Success: all of input converted. */
+        break;
+
+      /* There's been an error while converting. */
+      switch (errno)
+        {
+        case E2BIG:
+          /* Ran out of space in output buffer.  Allocate more
+             and try again. */
+          text_buffer_alloc (&output_buf, n);
+          continue;
+        case EILSEQ:
+          /* Byte sequence in input buffer not recognized. */
+          break;
+        case EINVAL:
+          /* Incomplete byte sequence at end of input buffer.  Try to read
+             more. */
+
+          /* input_length - 2 is offset of last-but-one byte within input.
+             This checks if there is at least one more byte within node
+             contents. */
+          if (inptr - input_start + (bytes_left - 1) <= input_length - 2)
+            {
+              bytes_left++;
+              extra_at_end++;
+            }
+          else
+            {
+              copy_direct (bytes_left);
+              bytes_left = 0;
+            }
+          continue; /* Retry with the longer input, or stop if none left. */
+        default: /* Unknown error - abort */
+          info_error (_("Error converting file character encoding."));
+
+          /* Skip past current input and hope we don't get an
+             error next time. */
+          inptr += bytes_left;
+          return 0;
+        }
+
+      /* Degrade to ASCII. */
+
+      if (file_is_in_utf8)
+        {
+          degrade_utf8 (&inptr, &bytes_left);
+          continue;
+        }
+
+      /* If file is not in UTF-8, we degrade to ASCII in two steps:
+         first convert the character to UTF-8, then look up a replacement
+         string.  Note that mixing iconv_to_output and iconv_to_utf8
+         on the same input may not work well if the input encoding
+         is stateful. */
+
+      /* We want to read exactly one character.  Do this by
+         restricting size of output buffer. */
+      utf8_char_ptr = utf8_char;
+      for (i = 1; i <= 4; i++)
+        {
+          utf8_char_free = i;
+          iconv_ret = iconv (iconv_to_utf8, &inptr, &bytes_left,
+                             &utf8_char_ptr, &utf8_char_free);
+          /* If we managed to write a character: */
+          if (utf8_char_ptr > utf8_char) break;
+        }
+
+      /* errno == E2BIG if iconv ran out of output buffer,
+         which is expected. */
+      if (iconv_ret == (size_t) -1 && errno != E2BIG)
+        {
+          /* Character is not recognized.  Copy a single byte. */
+          copy_direct (1);
+          bytes_left--; /* copy_direct advanced inptr past that byte */
+        }
+      else
+        {
+          utf8_char_ptr = utf8_char;
+          /* i is width of UTF-8 character */
+          degrade_utf8 (&utf8_char_ptr, &i);
+        }
+    }
+
+  /* Input consumed is N plus any extra bytes read at the end.  Must cast
+     because the difference between unsigned size_t is always positive. */
+  output_bytes_difference +=
+    (signed long) (n + extra_at_end)
+    - (signed long) (text_buffer_off (&output_buf) - output_start);
+
+  return extra_at_end;
+#endif /* HAVE_ICONV */
+}
+
+/* Copy text from input node contents, possibly converting the
+   character encoding and adjusting anchor offsets at the same time.
*/
 static void
 copy_input_to_output (size_t n)
 {
   if (rewrite_p)
     {
-      text_buffer_add_string (&output_buf, inptr, n);
-      inptr += n;
+      size_t bytes_left;
 
-      /* Check if we have gone past any anchors and
-         adjust with output_bytes_difference. */
-      if (anchor_to_adjust)
+      bytes_left = n;
+      while (bytes_left > 0)
         {
-          while ((*anchor_to_adjust)->nodestart - nodestart
-                 <= inptr - input_start)
+          if (!convert_encoding_p)
             {
-              (*anchor_to_adjust)->nodestart -= output_bytes_difference;
-              anchor_to_adjust++;
-              if (!*anchor_to_adjust || (*anchor_to_adjust)->nodelen != 0)
+              copy_direct (bytes_left);
+              bytes_left = 0;
+            }
+          else
+            {
+              size_t bytes_to_convert;
+              size_t extra_written;
+
+              if (anchor_to_adjust)
                 {
-                  anchor_to_adjust = 0;
-                  break;
+                  char *first_anchor =
+                    input_start + (*anchor_to_adjust)->nodestart - nodestart;
+
+                  /* If the anchor lies within the bytes still to be copied: */
+                  if (first_anchor < inptr + bytes_left)
+                    /* Convert enough to pass the first anchor in input. */
+                    bytes_to_convert = first_anchor - inptr + 1;
+                  else
+                    bytes_to_convert = bytes_left;
                 }
+              else
+                bytes_to_convert = bytes_left;
+
+              /* copy_converting may read more than bytes_to_convert
+                 bytes if its input ends in an incomplete byte sequence. */
+              extra_written = copy_converting (bytes_to_convert);
+
+              bytes_left -= bytes_to_convert + extra_written;
             }
+
+          /* Check if we have gone past any anchors and
+             adjust with output_bytes_difference. */
+          if (anchor_to_adjust)
+            while ((*anchor_to_adjust)->nodestart - nodestart
+                   <= inptr - input_start)
+              {
+                (*anchor_to_adjust)->nodestart -= output_bytes_difference;
+                anchor_to_adjust++;
+                if (!*anchor_to_adjust || (*anchor_to_adjust)->nodelen != 0)
+                  {
+                    anchor_to_adjust = 0;
+                    break;
+                  }
+              }
         }
     }
   else
-    {
-      inptr += n;
-    }
+    inptr += n;
 }
 
 static void
@@ -630,16 +951,16 @@ colon_after_newline (char *nodeptr)
   return -1;
 }
 
-/* Remove syntax from (*NODE)->contents and build list of references
+/* Remove syntax from FB->tags[TAG]->contents and build list of references
    in node.
Adjust anchors in tag table that point into node text.*/ void -scan_node_contents (NODE **tag) +scan_node_contents (FILE_BUFFER *fb, int tag) { SEARCH_BINDING s; char *search_string; int found_menu_entry, in_index = 0; - NODE *node = *tag; + NODE *node = fb->tags[tag]; REFERENCE **refs = NULL; size_t refs_index = 0, refs_slots = 0; @@ -654,11 +975,10 @@ scan_node_contents (NODE **tag) else rewrite_p = 0; - if (rewrite_p) - init_output_stream (); + init_output_stream (fb); /* Set anchor_to_adjust to first anchor in node, if any. */ - anchor_to_adjust = tag + 1; + anchor_to_adjust = &fb->tags[tag + 1]; if (!*anchor_to_adjust) anchor_to_adjust = 0; else if (*anchor_to_adjust && (*anchor_to_adjust)->nodelen != 0) @@ -675,6 +995,7 @@ scan_node_contents (NODE **tag) all other assignment should be done with the helper functions above. */ inptr = node->contents; input_start = node->contents; + input_length = node->nodelen; nodestart = node->nodestart; parse_top_node_line (node); @@ -995,7 +1316,7 @@ search_again: if (!rewrite_p) { rewrite_p = 1; - init_output_stream (); + init_output_stream (fb); /* Put inptr back to start so that copy_input_to_output below gets all @@ -1045,6 +1366,9 @@ search_again: if (rewrite_p) text_buffer_add_string (&output_buf, "\0", 1); + /* Free resources used in character encoding conversion. */ + close_conversion (); + node->references = refs; if (rewrite_p) @@ -1285,6 +1609,7 @@ text_buffer_vprintf (struct text_buffer *buf, const char *format, va_list ap) return n; } +/* Make sure there are LEN free bytes at end of BUF. */ void text_buffer_alloc (struct text_buffer *buf, size_t len) { @@ -1297,6 +1622,39 @@ text_buffer_alloc (struct text_buffer *buf, size_t len) } } +/* Return number of bytes that can be written to text buffer without + reallocating the text buffer. */ +size_t +text_buffer_space_left (struct text_buffer *buf) +{ + /* buf->size is the offset of the first byte after the allocated space. 
+     buf->off is the offset of the first byte to be written to. */
+  return buf->size - buf->off;
+}
+
+#if HAVE_ICONV
+
+/* Run iconv with conversion ICONV_STATE, using BUF as output buffer. */
+size_t
+text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
+                   char **inbuf, size_t *inbytesleft)
+{
+  size_t out_bytes_left;
+  char *outptr;
+  size_t iconv_ret;
+
+  outptr = text_buffer_base (buf) + text_buffer_off (buf);
+  out_bytes_left = text_buffer_space_left (buf);
+  iconv_ret = iconv (iconv_state, inbuf, inbytesleft,
+                     &outptr, &out_bytes_left);
+
+  text_buffer_off (buf) = outptr - text_buffer_base (buf);
+
+  return iconv_ret;
+}
+
+#endif /* HAVE_ICONV */
+
 size_t
 text_buffer_add_string (struct text_buffer *buf, const char *str, size_t len)
 {
diff --git a/info-utils.h b/info-utils.h
index 60825bf..88c2e9f 100644
--- a/info-utils.h
+++ b/info-utils.h
@@ -26,6 +26,10 @@
 #include "window.h"
 #include "search.h"
 
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
+
 /* When non-zero, various display and input functions handle ISO Latin
    character sets correctly. */
 extern int ISO_Latin_p;
@@ -56,10 +60,8 @@ extern char *info_parsed_nodename;
 */
 int info_parse_node (char *string, int flag);
 
-/* NODE points to a tag table entry.  Scan (*NODE)->contents for references
-   and set (*NODE)->references.  If preprocess_nodes_p=On, remove syntax
-   from NODE->contents.  Adjust anchors in tag table in this node. */
-void scan_node_contents (NODE **node);
+/* Scan contents of FB->tags[TAG]. */
+void scan_node_contents (FILE_BUFFER *fb, int tag);
 
 /* Get the entry associated with LABEL in REFERENCES.  Return a pointer
    to the reference if found, or NULL.
*/
@@ -120,6 +122,11 @@ void text_buffer_free (struct text_buffer *buf);
 void text_buffer_alloc (struct text_buffer *buf, size_t len);
 size_t text_buffer_vprintf (struct text_buffer *buf, const char *format,
                             va_list ap);
+size_t text_buffer_space_left (struct text_buffer *buf);
+#if HAVE_ICONV
+size_t text_buffer_iconv (struct text_buffer *buf, iconv_t iconv_state,
+                          char **inbuf, size_t *inbytesleft);
+#endif
 size_t text_buffer_add_string (struct text_buffer *buf, const char *str,
                                size_t len);
 size_t text_buffer_fill (struct text_buffer *buf, int c, size_t len);
diff --git a/nodes.c b/nodes.c
index 8541c8f..ce3df00 100644
--- a/nodes.c
+++ b/nodes.c
@@ -42,6 +42,7 @@ static void get_tags_of_indirect_tags_table (FILE_BUFFER *file_buffer,
     SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
 static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
 static char *adjust_nodestart (NODE *node, int min, int max);
+static void get_file_character_encoding (FILE_BUFFER *fb);
 static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
 static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
 static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -340,6 +341,53 @@ info_find_file_internal (char *filename, int get_tags)
   return file_buffer;
 }
 
+/* Look for local variables section in FB and set FB->encoding. */
+static void
+get_file_character_encoding (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+
+  long int enc_start, enc_end;
+  char *enc_string;
+
+  /* See if there is a local variables section in this info file. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  /* Null means the encoding is unknown. */
+  fb->encoding = 0;
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward (CHARACTER_ENCODING_LABEL, &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += strlen(CHARACTER_ENCODING_LABEL); /* Skip to after "coding:" */
+  enc_start += skip_whitespace(fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* Name ends at end of line, or at end of file if unterminated. */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+
+  enc_string = xmalloc (enc_end - enc_start + 1);
+  strncpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
+  enc_string[enc_end - enc_start] = '\0';
+
+  fb->encoding = enc_string;
+}
+
 /* The workhorse function for info_load_file ().  Non-zero second argument
    says to build a list of tags (or nodes) for this file.  This is the
    default behaviour when info_load_file () is called, but it is not
@@ -412,6 +460,9 @@ info_load_file_internal (char *filename, int get_tags)
   if (compressed)
     file_buffer->flags |= N_IsCompressed;
 
+  /* Find encoding of file, if set */
+  get_file_character_encoding (file_buffer);
+
   /* If requested, build the tags and nodes for this file buffer. */
   if (get_tags)
     build_tags_and_nodes (file_buffer);
@@ -1058,7 +1109,7 @@ info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer, char *nodename)
 
           /* Read locations of references in node and similar.  Rewrite
              node from tag->contents if preprocess_nodes=On. */
-          scan_node_contents (&file_buffer->tags[i]);
+          scan_node_contents (file_buffer, i);
           *node = *tag;
         }
 
diff --git a/nodes.h b/nodes.h
index 5f0a23d..ac0d432 100644
--- a/nodes.h
+++ b/nodes.h
@@ -98,6 +98,9 @@ typedef struct {
 #define TAGS_TABLE_BEG_LABEL            "Tag Table:\n"
 #define INDIRECT_TAGS_TABLE_LABEL       "Indirect:\n"
 #define TAGS_TABLE_IS_INDIRECT_LABEL    "(Indirect)"
+#define LOCAL_VARIABLES_LABEL           "Local Variables"
+#define CHARACTER_ENCODING_LABEL        "coding:"
+
 
 /* Character constants.
*/ #define INFO_COOKIE '\037' @@ -121,6 +124,7 @@ typedef struct { NODE **tags; /* If non-null, the indirect tags table. */ size_t tags_slots; /* Number of slots allocated for TAGS. */ int flags; /* Various flags. Mimics of N_* flags. */ + char *encoding; /* Name of character encoding of file. */ } FILE_BUFFER; /* Externally visible functions. */