>From 3df90147719110350d9a674cc37e99cbd27a9c3e Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Fri, 3 Jan 2020 22:34:07 +0100 Subject: [PATCH 1/5] mbrtowc: Refactor locale charset dispatching. * lib/lc-charset-dispatch.h: New file, extracted from lib/mbrtowc.c. * lib/lc-charset-dispatch.c: New file, extracted from lib/mbrtowc.c. * lib/mbrtowc.c: Include lc-charset-dispatch.h. Don't include localcharset.h, streq.h. (enc_t): Remove type. (locale_enc): Remove function. (cached_locale_enc): Remove variable. (locale_enc_cached): Remove function. (mbrtowc): Invoke locale_encoding_classification. * m4/mbrtowc.m4 (gl_PREREQ_MBRTOWC): Update comment. * modules/mbrtowc (Files): Add lc-charset-dispatch.h, lc-charset-dispatch.c. (configure.ac): Arrange to compile lc-charset-dispatch.c. --- ChangeLog | 17 ++++++++++ lib/lc-charset-dispatch.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++ lib/lc-charset-dispatch.h | 40 +++++++++++++++++++++++ lib/mbrtowc.c | 53 ++---------------------------- m4/mbrtowc.m4 | 2 +- modules/mbrtowc | 3 ++ 6 files changed, 145 insertions(+), 52 deletions(-) create mode 100644 lib/lc-charset-dispatch.c create mode 100644 lib/lc-charset-dispatch.h diff --git a/ChangeLog b/ChangeLog index 6c0d925..930f715 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2020-01-03 Bruno Haible + + mbrtowc: Refactor locale charset dispatching. + * lib/lc-charset-dispatch.h: New file, extracted from lib/mbrtowc.c. + * lib/lc-charset-dispatch.c: New file, extracted from lib/mbrtowc.c. + * lib/mbrtowc.c: Include lc-charset-dispatch.h. Don't include + localcharset.h, streq.h. + (enc_t): Remove type. + (locale_enc): Remove function. + (cached_locale_enc): Remove variable. + (locale_enc_cached): Remove function. + (mbrtowc): Invoke locale_encoding_classification. + * m4/mbrtowc.m4 (gl_PREREQ_MBRTOWC): Update comment. + * modules/mbrtowc (Files): Add lc-charset-dispatch.h, + lc-charset-dispatch.c. + (configure.ac): Arrange to compile lc-charset-dispatch.c. 
+ 2020-01-03 Paul Eggert doc: mention 32-bit time_t issue diff --git a/lib/lc-charset-dispatch.c b/lib/lc-charset-dispatch.c new file mode 100644 index 0000000..79057d4 --- /dev/null +++ b/lib/lc-charset-dispatch.c @@ -0,0 +1,82 @@ +/* Dispatching based on the current locale's character encoding. + Copyright (C) 2018-2020 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2018. */ + +#include <config.h> + +/* Specification. */ +#include "lc-charset-dispatch.h" + +#if GNULIB_defined_mbstate_t + +# include "localcharset.h" +# include "streq.h" + +# if GNULIB_WCHAR_SINGLE +/* When we know that the locale does not change, provide a speedup by + caching the value of locale_encoding_classification. */ +# define locale_encoding_classification_cached locale_encoding_classification +# else +/* By default, don't make assumptions, hence no caching. 
*/ +# define locale_encoding_classification_uncached locale_encoding_classification +# endif + +# if GNULIB_WCHAR_SINGLE +static inline +# endif +enc_t +locale_encoding_classification_uncached (void) +{ + const char *encoding = locale_charset (); + if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) + return enc_utf8; + if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) + return enc_eucjp; + if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) + || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) + || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) + return enc_94; + if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) + return enc_euctw; + if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) + return enc_gb18030; + if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) + return enc_sjis; + return enc_other; +} + +# if GNULIB_WCHAR_SINGLE + +static int cached_locale_enc = -1; + +enc_t +locale_encoding_classification_cached (void) +{ + if (cached_locale_enc < 0) + cached_locale_enc = locale_encoding_classification_uncached (); + return cached_locale_enc; +} + +# endif + +#else + +/* This declaration is solely to ensure that after preprocessing + this file is never empty. */ +typedef int dummy; + +#endif diff --git a/lib/lc-charset-dispatch.h b/lib/lc-charset-dispatch.h new file mode 100644 index 0000000..95c2316 --- /dev/null +++ b/lib/lc-charset-dispatch.h @@ -0,0 +1,40 @@ +/* Dispatching based on the current locale's character encoding. + Copyright (C) 2018-2020 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2018. */ + +#include <wchar.h> + +#if GNULIB_defined_mbstate_t + +/* A classification of special values of the encoding of the current locale. */ +typedef enum + { + enc_other, /* other */ + enc_utf8, /* UTF-8 */ + enc_eucjp, /* EUC-JP */ + enc_94, /* EUC-KR, GB2312, BIG5 */ + enc_euctw, /* EUC-TW */ + enc_gb18030, /* GB18030 */ + enc_sjis /* SJIS */ + } + enc_t; + +/* Returns a classification of special values of the encoding of the current + locale. */ +extern enc_t locale_encoding_classification (void); + +#endif diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c index 066d949..fdef8f9 100644 --- a/lib/mbrtowc.c +++ b/lib/mbrtowc.c @@ -54,9 +54,8 @@ # endif -# include "localcharset.h" -# include "streq.h" # include "verify.h" +# include "lc-charset-dispatch.h" # include "mbtowc-lock.h" # ifndef FALLTHROUGH @@ -67,54 +66,6 @@ # endif # endif -/* Returns a classification of special values of the encoding of the current - locale. 
*/ -typedef enum { - enc_other, /* other */ - enc_utf8, /* UTF-8 */ - enc_eucjp, /* EUC-JP */ - enc_94, /* EUC-KR, GB2312, BIG5 */ - enc_euctw, /* EUC-TW */ - enc_gb18030, /* GB18030 */ - enc_sjis /* SJIS */ -} enc_t; -static inline enc_t -locale_enc (void) -{ - const char *encoding = locale_charset (); - if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) - return enc_utf8; - if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) - return enc_eucjp; - if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) - || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) - || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) - return enc_94; - if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) - return enc_euctw; - if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) - return enc_gb18030; - if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) - return enc_sjis; - return enc_other; -} - -# if GNULIB_WCHAR_SINGLE -/* When we know that the locale does not change, provide a speedup by - caching the value of locale_enc. */ -static int cached_locale_enc = -1; -static inline enc_t -locale_enc_cached (void) -{ - if (cached_locale_enc < 0) - cached_locale_enc = locale_enc (); - return cached_locale_enc; -} -# else -/* By default, don't make assumptions, hence no caching. */ -# define locale_enc_cached locale_enc -# endif - verify (sizeof (mbstate_t) >= 4); static char internal_state[4]; @@ -177,7 +128,7 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) /* Here m > 0. */ - enc = locale_enc_cached (); + enc = locale_encoding_classification (); if (enc == enc_utf8) /* UTF-8 */ { diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4 index bd9225b..755f8c9 100644 --- a/m4/mbrtowc.m4 +++ b/m4/mbrtowc.m4 @@ -821,7 +821,7 @@ AC_DEFUN([gl_MBRTOWC_C_LOCALE], ]) ]) -# Prerequisites of lib/mbrtowc.c. 
+# Prerequisites of lib/mbrtowc.c and lib/lc-charset-dispatch.c. AC_DEFUN([gl_PREREQ_MBRTOWC], [ AC_REQUIRE([AC_C_INLINE]) : diff --git a/modules/mbrtowc b/modules/mbrtowc index db10256..22afc96 100644 --- a/modules/mbrtowc +++ b/modules/mbrtowc @@ -3,6 +3,8 @@ mbrtowc() function: convert multibyte character to wide character. Files: lib/mbrtowc.c +lib/lc-charset-dispatch.h +lib/lc-charset-dispatch.c lib/mbtowc-lock.h lib/mbtowc-lock.c lib/windows-initguard.h @@ -29,6 +31,7 @@ configure.ac: gl_FUNC_MBRTOWC if test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1; then AC_LIBOBJ([mbrtowc]) + AC_LIBOBJ([lc-charset-dispatch]) AC_LIBOBJ([mbtowc-lock]) gl_PREREQ_MBRTOWC gl_PREREQ_MBTOWC_LOCK -- 2.7.4