[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 14/18] encoding-guesser: New library to guess the encoding of a text file.
From: |
Ben Pfaff |
Subject: |
[PATCH 14/18] encoding-guesser: New library to guess the encoding of a text file. |
Date: |
Sat, 19 Mar 2011 17:10:00 -0700 |
This will be used by other new libraries in upcoming commits.
---
Smake | 3 +-
src/libpspp/automake.mk | 2 +
src/libpspp/encoding-guesser.c | 289 +++++++++++++++++++++++++++++++++
src/libpspp/encoding-guesser.h | 126 ++++++++++++++
tests/automake.mk | 6 +
tests/libpspp/encoding-guesser-test.c | 102 ++++++++++++
tests/libpspp/encoding-guesser.at | 143 ++++++++++++++++
7 files changed, 670 insertions(+), 1 deletions(-)
create mode 100644 src/libpspp/encoding-guesser.c
create mode 100644 src/libpspp/encoding-guesser.h
create mode 100644 tests/libpspp/encoding-guesser-test.c
create mode 100644 tests/libpspp/encoding-guesser.at
diff --git a/Smake b/Smake
index 7efa2cf..3a3235c 100644
--- a/Smake
+++ b/Smake
@@ -70,10 +70,11 @@ GNULIB_MODULES = \
sys_stat \
tempname \
trunc \
- unistd \
unictype/property-id-continue \
unictype/property-id-start \
unigbrk/uc-is-grapheme-break \
+ unistd \
+ unistr/u8-check \
unistr/u8-cpy \
unistr/u8-mbtouc \
unistr/u8-strlen \
diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk
index 823bbb3..5cf660a 100644
--- a/src/libpspp/automake.mk
+++ b/src/libpspp/automake.mk
@@ -20,6 +20,8 @@ src_libpspp_libpspp_la_SOURCES = \
src/libpspp/copyleft.h \
src/libpspp/deque.c \
src/libpspp/deque.h \
+ src/libpspp/encoding-guesser.c \
+ src/libpspp/encoding-guesser.h \
src/libpspp/ext-array.c \
src/libpspp/ext-array.h \
src/libpspp/float-format.c \
diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c
new file mode 100644
index 0000000..9042e93
--- /dev/null
+++ b/src/libpspp/encoding-guesser.c
@@ -0,0 +1,289 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "libpspp/encoding-guesser.h"
+
+#include <errno.h>
+#include <iconv.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistr.h>
+
+#include "libpspp/cast.h"
+#include "libpspp/i18n.h"
+
+#include "gl/localcharset.h"
+#include "gl/c-strcase.h"
+
+/* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
+ of information about encoding detection.
+*/
+
+/* Extracts the fallback encoding from ENCODING, which must be in one of the
+   forms described at the top of encoding-guesser.h.
+
+   NULL, "auto", "auto,locale", and "locale" (compared without regard to
+   case) all yield the result of locale_charset().  A name that starts with
+   "auto," yields the part after the comma.  Any other name is returned
+   unchanged.
+
+   Thus the returned string might be ENCODING itself or a suffix of it, or it
+   might be a statically allocated string. */
+const char *
+encoding_guess_parse_encoding (const char *encoding)
+{
+  bool use_locale = (encoding == NULL
+                     || c_strcasecmp (encoding, "auto") == 0
+                     || c_strcasecmp (encoding, "auto,locale") == 0
+                     || c_strcasecmp (encoding, "locale") == 0);
+
+  if (use_locale)
+    return locale_charset ();
+
+  return c_strncasecmp (encoding, "auto,", 5) == 0 ? encoding + 5 : encoding;
+}
+
+/* Reports whether ENCODING, which must be in one of the forms described at
+   the top of encoding-guesser.h, requests encoding autodetection.
+
+   That is the case for NULL, for "auto", and for any name that begins with
+   "auto," (all compared without regard to case). */
+bool
+encoding_guess_encoding_is_auto (const char *encoding)
+{
+  if (encoding == NULL)
+    return true;
+
+  if (c_strncasecmp (encoding, "auto", 4) != 0)
+    return false;
+
+  /* The first four bytes matched "auto", so byte 4 is within the string
+     (possibly its terminator). */
+  return encoding[4] == ',' || encoding[4] == '\0';
+}
+
+/* Returns the 16-bit value stored in big-endian byte order in the 2 bytes at
+   DATA. */
+static uint16_t
+get_be16 (const uint8_t *data)
+{
+  uint16_t high = data[0];
+  uint16_t low = data[1];
+
+  return (high << 8) | low;
+}
+
+/* Returns the 16-bit value stored in little-endian byte order in the 2 bytes
+   at DATA. */
+static uint16_t
+get_le16 (const uint8_t *data)
+{
+  uint16_t low = data[0];
+  uint16_t high = data[1];
+
+  return (high << 8) | low;
+}
+
+/* Returns the 32-bit value stored in big-endian byte order in the 4 bytes at
+   DATA.
+
+   Each byte is widened to uint32_t before it is shifted.  A bare uint8_t
+   operand is promoted to (signed) int, so shifting a byte >= 0x80 left by 24
+   would overflow int, which is undefined behavior. */
+static uint32_t
+get_be32 (const uint8_t *data)
+{
+  return ((uint32_t) data[0] << 24
+          | (uint32_t) data[1] << 16
+          | (uint32_t) data[2] << 8
+          | (uint32_t) data[3]);
+}
+
+/* Returns the 32-bit value stored in little-endian byte order in the 4 bytes
+   at DATA.
+
+   Each byte is widened to uint32_t before it is shifted.  A bare uint8_t
+   operand is promoted to (signed) int, so shifting a byte >= 0x80 left by 24
+   would overflow int, which is undefined behavior. */
+static uint32_t
+get_le32 (const uint8_t *data)
+{
+  return ((uint32_t) data[3] << 24
+          | (uint32_t) data[2] << 16
+          | (uint32_t) data[1] << 8
+          | (uint32_t) data[0]);
+}
+
+/* Guesses whether the N bytes at DATA are UTF-16 text that lacks a byte order
+   mark, by counting zero bytes at even and at odd offsets.
+
+   Returns "UTF-16LE" if more odd-offset than even-offset bytes are zero,
+   otherwise "UTF-16BE" if at least one even-offset byte is zero, otherwise
+   NULL.  Also returns NULL if any aligned pair of bytes is entirely zero, or
+   if N is odd and less than ENCODING_GUESS_MIN.  A trailing unpaired byte is
+   ignored. */
+static const char *
+guess_utf16 (const uint8_t *data, size_t n)
+{
+  size_t zeros_at_even = 0;
+  size_t zeros_at_odd = 0;
+  size_t i;
+
+  if (n % 2 != 0 && n < ENCODING_GUESS_MIN)
+    return NULL;
+
+  for (i = 0; n - i >= 2; i += 2)
+    {
+      bool first_is_zero = data[i] == 0;
+      bool second_is_zero = data[i + 1] == 0;
+
+      if (first_is_zero && second_is_zero)
+        return NULL;
+
+      zeros_at_even += first_is_zero;
+      zeros_at_odd += second_is_zero;
+    }
+
+  if (zeros_at_odd > zeros_at_even)
+    return "UTF-16LE";
+  if (zeros_at_even > 0)
+    return "UTF-16BE";
+  return NULL;
+}
+
+/* Returns true if every complete 4-byte unit in the N bytes at DATA, decoded
+   with GET_U32, lies in the range 0x09...0x10ffff, false otherwise.
+
+   Also returns false if N is less than ENCODING_GUESS_MIN and not a multiple
+   of 4.  Trailing bytes that do not fill a 4-byte unit are ignored. */
+static bool
+is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
+{
+  size_t ofs;
+
+  if (n % 4 != 0 && n < ENCODING_GUESS_MIN)
+    return false;
+
+  for (ofs = 0; n - ofs >= 4; ofs += 4)
+    {
+      uint32_t uc = get_u32 (data + ofs);
+
+      if (uc > 0x10ffff || uc < 0x09)
+        return false;
+    }
+
+  return true;
+}
+
+/* Returns the length of the initial run of bytes at S_, at most N bytes long,
+   for which encoding_guess_is_ascii_text() is true. */
+size_t
+encoding_guess_count_ascii (const void *s_, size_t n)
+{
+  const uint8_t *bytes = s_;
+  size_t count = 0;
+
+  while (count < n && encoding_guess_is_ascii_text (bytes[count]))
+    count++;
+
+  return count;
+}
+
+/* Returns true if the N bytes at S_ look like UTF-8 text, false otherwise.
+
+   Each byte below 0x80 must satisfy encoding_guess_is_ascii_text().  Each
+   other sequence is decoded with u8_mbtoucr(): a result of -2 (the sequence is
+   cut off by the end of the buffer) is accepted, and any other negative
+   result is rejected. */
+static bool
+is_all_utf8_text (const void *s_, size_t n)
+{
+  const uint8_t *s = s_;
+  size_t ofs;
+
+  for (ofs = 0; ofs < n; )
+    {
+      if (s[ofs] >= 0x80)
+        {
+          ucs4_t uc;
+          int mblen = u8_mbtoucr (&uc, s + ofs, n - ofs);
+
+          if (mblen < 0)
+            return mblen == -2;
+          ofs += mblen;
+        }
+      else if (encoding_guess_is_ascii_text (s[ofs]))
+        ofs++;
+      else
+        return false;
+    }
+
+  return true;
+}
+
+/* Attempts to guess the encoding of a text file based on ENCODING, an encoding
+ name in one of the forms described at the top of encoding-guesser.h, and
+ DATA, which contains the first N bytes of the file. Returns the guessed
+ encoding, which might be ENCODING itself or a suffix of it or a statically
+ allocated string.
+
+ Encoding autodetection only takes place if ENCODING actually specifies
+ autodetection. See encoding-guesser.h for details. Otherwise, or if N is
+ 0, the fallback encoding parsed from ENCODING is returned.
+
+ The checks are made in this order, and the first one that matches wins:
+
+ - A 32-bit byte order mark (0xfeff in either byte order): "UTF-32".
+
+ - First four bytes 84 31 95 33: "GB-18030". First four bytes
+ dd 73 66 73: "UTF-EBCDIC".
+
+ - A 16-bit byte order mark (0xfeff in either byte order): "UTF-16".
+
+ - First three bytes ef bb bf: "UTF-8".
+
+ - The zero-byte heuristic in guess_utf16(): "UTF-16LE" or "UTF-16BE".
+
+ - All 32-bit units acceptable to is_utf32(): "UTF-32BE" or "UTF-32LE".
+
+ - The fallback encoding, if it is not ASCII compatible or if DATA does
+ not pass encoding_guess_tail_is_utf8().
+
+ - "UTF-8", if the fallback encoding is itself "UTF-8" or "UTF8".
+
+ - Otherwise "ASCII".
+
+ When N is less than ENCODING_GUESS_MIN, the byte order mark checks are
+ made only if N is a multiple of the mark's size (4 or 2 bytes).
+
+ UTF-8 cannot be distinguished from other ASCII-based encodings until a
+ non-ASCII text character is encountered. If ENCODING specifies
+ autodetection and this function returns "ASCII", then the client should
+ process the input until it encounters a non-ASCII character (as returned by
+ encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
+ to make a final encoding guess. See encoding-guesser.h for details.
+
+ N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
+ that. */
+const char *
+encoding_guess_head_encoding (const char *encoding,
+ const void *data_, size_t n)
+{
+ const uint8_t *data = data_;
+ const char *fallback_encoding;
+ const char *guess;
+
+ fallback_encoding = encoding_guess_parse_encoding (encoding);
+ if (!encoding_guess_encoding_is_auto (encoding))
+ return fallback_encoding;
+
+ if (n == 0)
+ return fallback_encoding;
+
+ if ((n >= ENCODING_GUESS_MIN || n % 4 == 0)
+ && (get_be32 (data) == 0xfeff || get_le32 (data) == 0xfeff))
+ return "UTF-32";
+
+ if (n >= 4)
+ {
+ uint32_t x = get_be32 (data);
+ if (x == 0x84319533)
+ return "GB-18030";
+ else if (x == 0xdd736673)
+ return "UTF-EBCDIC";
+ }
+
+ if ((n >= ENCODING_GUESS_MIN || n % 2 == 0)
+ && (get_be16 (data) == 0xfeff || get_le16 (data) == 0xfeff))
+ return "UTF-16";
+
+ if (n >= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf)
+ return "UTF-8";
+
+ guess = guess_utf16 (data, n);
+ if (guess != NULL)
+ return guess;
+
+ if (is_utf32 (data, n, get_be32))
+ return "UTF-32BE";
+ if (is_utf32 (data, n, get_le32))
+ return "UTF-32LE";
+
+ if (!is_encoding_ascii_compatible (fallback_encoding)
+ || !encoding_guess_tail_is_utf8 (data, n))
+ return fallback_encoding;
+
+ if (!c_strcasecmp (fallback_encoding, "UTF-8")
+ || !c_strcasecmp (fallback_encoding, "UTF8"))
+ return "UTF-8";
+
+ return "ASCII";
+}
+
+/* Makes the final encoding guess for input whose head was guessed as "ASCII".
+
+   DATA points to N bytes of text that should begin with the first non-ASCII
+   text character (as determined by encoding_guess_is_ascii_text()) found in
+   the input.  ENCODING is the encoding name originally supplied by the
+   client.
+
+   Returns "UTF-8" if encoding_guess_tail_is_utf8() accepts the data, and
+   otherwise the fallback encoding for ENCODING.
+
+   See encoding-guesser.h for intended use of this function.
+
+   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
+   that starting with the first non-ASCII text character. */
+const char *
+encoding_guess_tail_encoding (const char *encoding,
+                              const void *data, size_t n)
+{
+  if (encoding_guess_tail_is_utf8 (data, n))
+    return "UTF-8";
+  else
+    return encoding_guess_parse_encoding (encoding);
+}
+
+/* Makes the same guess as encoding_guess_tail_encoding(), but returns true
+   where that function would return "UTF-8" and false where it would return
+   the fallback encoding.
+
+   A buffer shorter than ENCODING_GUESS_MIN is checked with u8_check(); a
+   longer one is checked with is_all_utf8_text(). */
+bool
+encoding_guess_tail_is_utf8 (const void *data, size_t n)
+{
+  if (n >= ENCODING_GUESS_MIN)
+    return is_all_utf8_text (data, n);
+
+  return u8_check (data, n) == NULL;
+}
+
diff --git a/src/libpspp/encoding-guesser.h b/src/libpspp/encoding-guesser.h
new file mode 100644
index 0000000..b697e7e
--- /dev/null
+++ b/src/libpspp/encoding-guesser.h
@@ -0,0 +1,126 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef LIBPSPP_ENCODING_GUESSER_H
+#define LIBPSPP_ENCODING_GUESSER_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* A library for autodetecting the encoding of a text file.
+
+ Naming Encodings
+ ----------------
+
+ The encoding guesser starts with an encoding name in one of various
+ different forms. Some of the forms do not actually do any autodetection.
+ The encoding guesser will return the specified encoding without looking at
+ any file data:
+
+ - A valid IANA or system encoding name: These are returned as-is.
+
+ - "Locale": Translated to the encoding used by the system locale, as
+ returned by locale_charset().
+
+ The remaining forms that do perform autodetection are:
+
+ - "Auto," followed by a valid IANA or system encoding name (the "fallback
+ encoding"): Requests detection whether the input is encoded in UTF-8,
+ UTF-16, UTF-32, or a few other easily identifiable charsets. When a
+ particular character set cannot be recognized, the guesser falls back to
+ the encoding following the comma. UTF-8 detection works only for
+ ASCII-compatible character sets; other fallback encodings are returned
+ as-is, without any attempt at UTF-8 detection.
+
+ - NULL or "Auto": As above, with the encoding used by the system locale as
+ the fallback encoding.
+
+ The above are suggested capitalizations but encoding names are not
+ case-sensitive.
+
+ The encoding_guess_parse_encoding() and encoding_guess_encoding_is_auto()
+ functions work with encoding names in these forms.
+
+ Usage
+ -----
+
+ 1. Call encoding_guess_head_encoding() with several bytes from the start of
+ the text file. Feed in at least ENCODING_GUESS_MIN bytes, unless the
+ file is shorter than that, but as many more as are conveniently
+ available. ENCODING_GUESS_SUGGESTED is a reasonable amount.
+
+ encoding_guess_head_encoding() returns its best guess at the file's
+ encoding. Ordinarily it returns a final guess that the client can use to
+ interpret the file, and you're all done. However, if it returns "ASCII"
+ and the original encoding name requests autodetection (which you can find
+ out by calling encoding_guess_encoding_is_auto()), then proceed to the
+ next step.
+
+ 2. The encoding guesser is confident that the stream uses an ASCII
+ compatible encoding, either UTF-8 or the fallback encoding. The client
+ may safely read and process the stream up to the first non-ASCII
+ character. If the stream continues to be ASCII all the way to its end,
+ then we're done.
+
+ The encoding guesser provides a pair of functions to detect non-ASCII
+ characters: encoding_guess_is_ascii_text() for single characters and
+ encoding_guess_count_ascii() as a convenient wrapper for whole buffers.
+
+ 3. Otherwise, the stream contains some non-ASCII data at some point. Now
+ the client should gather several bytes starting at this point, at least
+ ENCODING_GUESS_MIN, unless the file ends before that, but as many more as
+ are conveniently available. ENCODING_GUESS_SUGGESTED is a reasonable
+ amount.
+
+ The client should pass these bytes to encoding_guess_tail_encoding(),
+ which returns a best and final guess at the file's encoding, which is
+ either UTF-8 or the fallback encoding. Another alternative is
+ encoding_guess_tail_is_utf8(), which guesses the same way but has a
+ different form of return value.
+*/
+
+/* Minimum number of bytes for use in autodetection.
+ You should only pass fewer bytes to the autodetection routines if the file
+ is actually shorter than this. */
+#define ENCODING_GUESS_MIN 16
+
+/* Suggested minimum buffer size to use for autodetection. */
+#define ENCODING_GUESS_SUGGESTED 1024
+
+/* Parsing encoding names. */
+const char *encoding_guess_parse_encoding (const char *encoding);
+bool encoding_guess_encoding_is_auto (const char *encoding);
+
+/* Making an initial coding guess based on the start of a file. */
+const char *encoding_guess_head_encoding (const char *encoding,
+ const void *, size_t);
+
+/* Refining an initial ASCII coding guess using later non-ASCII bytes. */
+static inline bool encoding_guess_is_ascii_text (uint8_t c);
+size_t encoding_guess_count_ascii (const void *, size_t);
+bool encoding_guess_tail_is_utf8 (const void *, size_t);
+const char *encoding_guess_tail_encoding (const char *encoding,
+ const void *, size_t);
+
+/* Returns true if C is a byte that might appear in an ASCII text file, false
+   otherwise.  The bytes accepted are 0x20...0x7e and 0x09...0x0d. */
+static inline bool
+encoding_guess_is_ascii_text (uint8_t c)
+{
+  bool is_graphic_or_space = c >= 0x20 && c < 0x7f;
+  bool is_control_whitespace = c >= 0x09 && c < 0x0e;
+
+  return is_graphic_or_space || is_control_whitespace;
+}
+
+#endif /* libpspp/encoding-guesser.h */
diff --git a/tests/automake.mk b/tests/automake.mk
index 639af7e..7ef7d42 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -7,6 +7,7 @@ check_PROGRAMS += \
tests/language/lexer/command-name-test \
tests/libpspp/abt-test \
tests/libpspp/bt-test \
+ tests/libpspp/encoding-guesser-test \
tests/libpspp/heap-test \
tests/libpspp/hmap-test \
tests/libpspp/hmapx-test \
@@ -50,6 +51,10 @@ tests_libpspp_llx_test_SOURCES = \
tests_libpspp_llx_test_LDADD = gl/libgl.la $(LIBINTL)
tests_libpspp_llx_test_CFLAGS = $(AM_CFLAGS)
+tests_libpspp_encoding_guesser_test_SOURCES = \
+ tests/libpspp/encoding-guesser-test.c
+tests_libpspp_encoding_guesser_test_LDADD = src/libpspp/libpspp.la gl/libgl.la
+
tests_libpspp_heap_test_SOURCES = \
src/libpspp/heap.c \
src/libpspp/pool.c \
@@ -313,6 +318,7 @@ TESTSUITE_AT = \
tests/language/xforms/select-if.at \
tests/libpspp/abt.at \
tests/libpspp/bt.at \
+ tests/libpspp/encoding-guesser.at \
tests/libpspp/float-format.at \
tests/libpspp/heap.at \
tests/libpspp/hmap.at \
diff --git a/tests/libpspp/encoding-guesser-test.c
b/tests/libpspp/encoding-guesser-test.c
new file mode 100644
index 0000000..a20607e
--- /dev/null
+++ b/tests/libpspp/encoding-guesser-test.c
@@ -0,0 +1,102 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "libpspp/encoding-guesser.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libpspp/i18n.h"
+
+#include "gl/error.h"
+#include "gl/progname.h"
+#include "gl/xalloc.h"
+
+/* Prints a usage message on stdout, then exits with a success status. */
+static void
+usage (void)
+{
+  const int default_bufsize = ENCODING_GUESS_MIN;
+
+  printf ("usage: %s [OTHER_ENCODING] [BUFSIZE] < INPUT\n"
+          "where OTHER_ENCODING is the fallback encoding (default taken\n"
+          " from the current locale)\n"
+          " and BUFSIZE is the buffer size (default %d)\n",
+          program_name, default_bufsize);
+  exit (EXIT_SUCCESS);
+}
+
+/* Reads text from stdin and prints a guess at its encoding on stdout.
+
+   The command line may give, in either order, a fallback encoding name (the
+   first argument that does not start with a digit) and a buffer size (the
+   first argument that starts with a digit, at least ENCODING_GUESS_MIN).
+   "--help" prints a usage message instead.
+
+   The head of the input is passed to encoding_guess_head_encoding().  If
+   that yields "ASCII" and the encoding requests autodetection, input is read
+   until the first non-ASCII byte, and the data starting there is passed to
+   encoding_guess_tail_encoding() for the final guess. */
+int
+main (int argc, char *argv[])
+{
+  const char *encoding, *guess;
+  char *buffer;
+  int bufsize;
+  size_t n;
+  int i;
+
+  set_program_name (argv[0]);
+
+  i18n_init ();
+
+  encoding = NULL;
+  bufsize = 0;
+  for (i = 1; i < argc; i++)
+    {
+      const char *arg = argv[i];
+
+      /* The <ctype.h> functions are only defined for EOF and values
+         representable as unsigned char, so a plain char that might be
+         negative must be converted before it is passed to isdigit(). */
+      if (!strcmp (arg, "--help"))
+        usage ();
+      else if (isdigit ((unsigned char) arg[0]) && bufsize == 0)
+        {
+          bufsize = atoi (arg);
+          if (bufsize < ENCODING_GUESS_MIN)
+            error (1, 0, "buffer size %s is less than minimum size %d",
+                   arg, ENCODING_GUESS_MIN);
+        }
+      else if (!isdigit ((unsigned char) arg[0]) && encoding == NULL)
+        encoding = arg;
+      else
+        error (1, 0, "bad syntax; use `%s --help' for help", program_name);
+    }
+
+  if (bufsize == 0)
+    bufsize = ENCODING_GUESS_MIN;
+
+  buffer = xmalloc (bufsize);
+
+  n = fread (buffer, 1, bufsize, stdin);
+  guess = encoding_guess_head_encoding (encoding, buffer, n);
+  if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
+    while (n > 0)
+      {
+        size_t n_ascii = encoding_guess_count_ascii (buffer, n);
+        if (n == n_ascii)
+          n = fread (buffer, 1, bufsize, stdin);
+        else
+          {
+            /* Move the non-ASCII tail to the start of the buffer and top the
+               buffer up, so that the tail guesser sees as much data as
+               possible starting at the first non-ASCII byte. */
+            memmove (buffer, buffer + n_ascii, n - n_ascii);
+            n -= n_ascii;
+            n += fread (buffer + n, 1, bufsize - n, stdin);
+
+            guess = encoding_guess_tail_encoding (encoding, buffer, n);
+            break;
+          }
+      }
+  puts (guess);
+
+  return 0;
+}
diff --git a/tests/libpspp/encoding-guesser.at
b/tests/libpspp/encoding-guesser.at
new file mode 100644
index 0000000..d63dc37
--- /dev/null
+++ b/tests/libpspp/encoding-guesser.at
@@ -0,0 +1,143 @@
+AT_BANNER([encoding guesser])
+
+AT_SETUP([ASCII])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([echo string | encoding-guesser-test Auto,ISO-8859-1], [0], [ASCII
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-8])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf '\346\227\245\346\234\254\350\252\236\n' |
encoding-guesser-test Auto,ISO-8859-1], [0], [UTF-8
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-8 starting with ASCII])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf
'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\346\227\245\346\234\254\350\252\236\n'
| encoding-guesser-test Auto,ISO-8859-1 32], [0], [UTF-8
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16 with big-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\376\377' | encoding-guesser-test Auto,ISO-8859-1],
+ [0], [UTF-16
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16 with little-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\377\376' | encoding-guesser-test Auto,ISO-8859-1],
+ [0], [UTF-16
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16BE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0e\0n\0t\0r\0\351\0e\0\n' | encoding-guesser-test
Auto,ISO-8859-1],
+ [0], [UTF-16BE
+])
+AT_CLEANUP
+
+dnl Unicode U+XX00 characters are confusing in UTF-16 because they look
+dnl likely to be of the opposite endianness, so this tests for proper handling.
+AT_SETUP([UTF-16BE starting with U+0100])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\1\0\0e\0n\0t\0r\0\351\0e\0\n' | encoding-guesser-test
Auto,ISO-8859-1],
+ [0], [UTF-16BE
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-16LE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf 'e\0n\0t\0r\0\351\0e\0\n\0' | encoding-guesser-test
Auto,ISO-8859-1],
+ [0], [UTF-16LE
+])
+AT_CLEANUP
+
+dnl Unicode U+XX00 characters are confusing in UTF-16 because they look
+dnl likely to be of the opposite endianness, so this tests for proper handling.
+AT_SETUP([UTF-16LE starting with U+0100])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0\1e\0n\0t\0r\0\351\0e\0\n\0' | encoding-guesser-test
Auto,ISO-8859-1],
+ [0], [UTF-16LE
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32 with big-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0\0\376\377' | encoding-guesser-test Auto,ISO-8859-1],
+ [0], [UTF-32
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32 with little-endian byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\377\376\0\0' | encoding-guesser-test Auto,ISO-8859-1],
+ [0], [UTF-32
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32BE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf '\0\0\0e\0\0\0n\0\0\0t\0\0\0r\0\0\0\351\0\0\0e\0\0\0\n' |
encoding-guesser-test Auto,ISO-8859-1],
+ [0], [UTF-32BE
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-32LE])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([printf 'e\0\0\0n\0\0\0t\0\0\0r\0\0\0\351\0\0\0e\0\0\0\n\0\0\0' |
encoding-guesser-test Auto,ISO-8859-1],
+ [0], [UTF-32LE
+])
+AT_CLEANUP
+
+AT_SETUP([ISO-8859-1])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf 'entr\351e\n' | encoding-guesser-test Auto,ISO-8859-1],
+ [0], [ISO-8859-1
+])
+AT_CLEANUP
+
+AT_SETUP([GB-18030 with byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf '\204\061\225\063' | encoding-guesser-test Auto,ISO-8859-1],
+ [0], [GB-18030
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-EBCDIC with byte order mark])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf '\335\163\146\163' | encoding-guesser-test Auto,ISO-8859-1],
+ [0], [UTF-EBCDIC
+])
+AT_CLEANUP
+
+AT_SETUP([EUC-JP as Auto,EUC-JP])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf '\244\241 \244\242 \244\243 \244\244 \244\245 \244\246
\244\247 \244\250 \244\251 \244\252\n' | encoding-guesser-test Auto,EUC-JP],
+ [0], [EUC-JP
+])
+AT_CLEANUP
+
+AT_SETUP([EUC-JP starting with ASCII as Auto,EUC-JP])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings EUC-JP])
+AT_CHECK([printf
'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \244\241
\244\242 \244\243 \244\244 \244\245 \244\246 \244\247 \244\250 \244\251
\244\252\n' | encoding-guesser-test Auto,EUC-JP 32],
+ [0], [EUC-JP
+])
+AT_CLEANUP
+
+AT_SETUP([UTF-8 with character split across input buffers])
+AT_KEYWORDS([encoding guesser])
+AT_CHECK([supports_encodings ISO-8859-1])
+AT_CHECK([printf
'\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\343\201\201\343\201\202\343\201\203\343\201\204\343\201\205\343\201\206\343\201\207\343\201\210\343\201\211\343\201\212\n'
| encoding-guesser-test Auto,ISO-8859-1 32],
+ [0], [UTF-8
+])
+AT_CLEANUP
--
1.7.2.3
- [PATCH 00/18] rewrite PSPP lexer, Ben Pfaff, 2011/03/19
- [PATCH 01/18] data-reader: Remove unreachable "return" statements., Ben Pfaff, 2011/03/19
- [PATCH 07/18] str: New functions for checking for and removing string suffixes., Ben Pfaff, 2011/03/19
- [PATCH 10/18] i18n: New function recode_string_len()., Ben Pfaff, 2011/03/19
- [PATCH 09/18] i18n: New function uc_name()., Ben Pfaff, 2011/03/19
- [PATCH 14/18] encoding-guesser: New library to guess the encoding of a text file.,
Ben Pfaff <=
- [PATCH 04/18] output: New function text_item_create_nocopy()., Ben Pfaff, 2011/03/19
- [PATCH 05/18] str: New function ss_realloc()., Ben Pfaff, 2011/03/19
- [PATCH 13/18] i18n: New functions and data structure for obtaining encoding info., Ben Pfaff, 2011/03/19
- [PATCH 06/18] str: Rename ss_chomp() to ss_chomp_byte(), ds_chomp() to ds_chomp_byte()., Ben Pfaff, 2011/03/19
- [PATCH 02/18] file-name: Do not make output files line-buffered in fn_open()., Ben Pfaff, 2011/03/19
- [PATCH 12/18] identifier: Rename token_type_to_string() and make a new version., Ben Pfaff, 2011/03/19
- [PATCH 08/18] hash-functions: New function hash_case_bytes()., Ben Pfaff, 2011/03/19
- [PATCH 11/18] i18n: New functions for truncating strings in an arbitrary encoding., Ben Pfaff, 2011/03/19
- [PATCH 17/18] scan: New library for high-level PSPP syntax lexical analysis., Ben Pfaff, 2011/03/19
- [PATCH 15/18] u8-istream: New library for reading a text file and recoding to UTF-8., Ben Pfaff, 2011/03/19