diff --git a/builtins/printf.def b/builtins/printf.def index 9eca215..77a8159 100644 --- a/builtins/printf.def +++ b/builtins/printf.def @@ -859,15 +859,9 @@ tescape (estart, cp, lenp, sawc) *cp = '\\'; return 0; } - if (uvalue <= UCHAR_MAX) - *cp = uvalue; - else - { - temp = u32cconv (uvalue, cp); - cp[temp] = '\0'; - if (lenp) - *lenp = temp; - } + temp = utf32tomb (cp, uvalue); + if (lenp) + *lenp = temp; break; #endif diff --git a/externs.h b/externs.h index 09244fa..ff3f344 100644 --- a/externs.h +++ b/externs.h @@ -460,7 +460,7 @@ extern unsigned int falarm __P((unsigned int, unsigned int)); extern unsigned int fsleep __P((unsigned int, unsigned int)); /* declarations for functions defined in lib/sh/unicode.c */ -extern int u32cconv __P((unsigned long, char *)); +extern int utf32tomb __P((char *, unsigned long)); /* declarations for functions defined in lib/sh/winsize.c */ extern void get_new_window_size __P((int, int *, int *)); diff --git a/lib/sh/strtrans.c b/lib/sh/strtrans.c index 2265782..e410cff 100644 --- a/lib/sh/strtrans.c +++ b/lib/sh/strtrans.c @@ -28,6 +28,7 @@ #include #include +#include #include "shell.h" #ifdef ESC @@ -140,21 +141,10 @@ ansicstr (string, len, flags, sawc, rlen) for (v = 0; ISXDIGIT ((unsigned char)*s) && temp--; s++) v = (v * 16) + HEXVALUE (*s); if (temp == ((c == 'u') ? 4 : 8)) - { *r++ = '\\'; /* c remains unchanged */ - break; - } - else if (v <= UCHAR_MAX) - { - c = v; - break; - } else - { - temp = u32cconv (v, r); - r += temp; - continue; - } + r += utf32tomb (r, v); + break; #endif case '\\': break; diff --git a/lib/sh/unicode.c b/lib/sh/unicode.c index d34fa08..5cc96bf 100644 --- a/lib/sh/unicode.c +++ b/lib/sh/unicode.c @@ -36,13 +36,7 @@ #include -#ifndef USHORT_MAX -# ifdef USHRT_MAX -# define USHORT_MAX USHRT_MAX -# else -# define USHORT_MAX ((unsigned short) ~(unsigned short)0) -# endif -#endif +#include "bashintl.h" #if !defined (STREQ) # define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0) @@ -54,13 +48,14 @@ extern const char *locale_charset __P((void)); extern char *get_locale_var __P((char *)); #endif -static int u32init = 0; +const char *charset; static int utf8locale = 0; #if defined (HAVE_ICONV) static iconv_t localconv; #endif #ifndef HAVE_LOCALE_CHARSET +static char charset_buffer[40]={0}; static char * stub_charset () { @@ -68,168 +63,267 @@ stub_charset () locale = get_locale_var ("LC_CTYPE"); if (locale == 0 || *locale == 0) - return "ASCII"; - s = strrchr (locale, '.'); - if (s) { - t = strchr (s, '@'); - if (t) - *t = 0; - return ++s; + strcpy(charset_buffer, "ASCII"); } - else if (STREQ (locale, "UTF-8")) - return "UTF-8"; else - return "ASCII"; + { + s = strrchr (locale, '.'); + if (s) + { + t = strchr (s, '@'); + if (t) + *t = 0; + strcpy(charset_buffer, s); + } + else + { + strcpy(charset_buffer, locale); + } + /* free(locale) If we can Modify the buffer surely we need to free it?*/ + } + return charset_buffer; } #endif -/* u32toascii ? */ + +#if 0 int -u32tochar (wc, s) - wchar_t wc; +utf32tobig5 (s, c) char *s; + unsigned long c; { - unsigned long x; int l; - x = wc; - l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4); - - if (x <= UCHAR_MAX) - s[0] = x & 0xFF; - else if (x <= USHORT_MAX) /* assume unsigned short = 16 bits */ + if (c <= 0x7F) { - s[0] = (x >> 8) & 0xFF; - s[1] = x & 0xFF; + s[0] = (char)c; + l = 1; + } + else if ((c >= 0x8000) && (c <= 0xFFFF)) + { + s[0] = (char)(c>>8); + s[1] = (char)(c &0xFF); + l = 2; } else { - s[0] = (x >> 24) & 0xFF; - s[1] = (x >> 16) & 0xFF; - s[2] = (x >> 8) & 0xFF; - s[3] = x & 0xFF; + /* Error Invalid UTF-8 */ + l = 0; } s[l] = '\0'; - return l; + return l; } - +#endif int -u32toutf8 (wc, s) - wchar_t wc; +utf32toutf8 (s, c) char *s; + unsigned long c; { int l; - l = (wc < 0x0080) ? 1 : ((wc < 0x0800) ? 2 : 3); - - if (wc < 0x0080) - s[0] = (unsigned char)wc; - else if (wc < 0x0800) + if (c <= 0x7F) { - s[0] = (wc >> 6) | 0xc0; - s[1] = (wc & 0x3f) | 0x80; + s[0] = (char)c; + l = 1; + } + else if (c <= 0x7FF) + { + s[0] = (c >> 6) | 0xc0; /* 110x xxxx */ + s[1] = (c & 0x3f) | 0x80; /* 10xx xxxx */ + l = 2; + } + else if (c <= 0xFFFF) + { + s[0] = (c >> 12) | 0xe0; /* 1110 xxxx */ + s[1] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ + s[2] = (c & 0x3f) | 0x80; /* 10xx xxxx */ + l = 3; + } + else if (c <= 0x1FFFFF) + { + s[0] = (c >> 18) | 0xf0; /* 1111 0xxx */ + s[1] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */ + s[2] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ + s[3] = ( c & 0x3f) | 0x80; /* 10xx xxxx */ + l = 4; + } + else if (c <= 0x3FFFFFF) + { + s[0] = (c >> 24) | 0xf8; /* 1111 10xx */ + s[1] = ((c >> 18) & 0x3f) | 0x80; /* 10xx xxxx */ + s[2] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */ + s[3] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ + s[4] = ( c & 0x3f) | 0x80; /* 10xx xxxx */ + l = 5; + } + else if (c <= 0x7FFFFFFF) + { + s[0] = (c >> 30) | 0xfc; /* 1111 110x */ + s[1] = ((c >> 24) & 0x3f) | 0x80; /* 10xx xxxx */ + s[2] = ((c >> 18) & 0x3f) | 0x80; /* 10xx xxxx */ + s[3] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */ + s[4] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ + s[5] = ( c & 0x3f) | 0x80; /* 10xx xxxx */ + l = 6; } else { - s[0] = (wc >> 12) | 0xe0; - s[1] = ((wc >> 6) & 0x3f) | 0x80; - s[2] = (wc & 0x3f) | 0x80; + /* Error Invalid UTF-8 */ + l = 0; } s[l] = '\0'; return l; } +int +utf32toutf16 (s, c) + unsigned short *s; + unsigned long c; +{ + int l=0; + if (c < 0xD800) + { + // Valid character directly convertible to 16 bits + s[0] = (unsigned short)(c&0xFFFF); + l=1; + } + else if ( (c >= 0x0000E000) && (c <= 0x0010FFFF) ) + { + // Character will be converted to 2 UTF-16 elements + c -= 0x0010000; + s[0] = (unsigned short)((c >> 10) + 0xD800); /* 1101 10XX XXXX XXXX */ + s[1] = (unsigned short)((c & 0x3FFUL) + 0xDC00); /* 1101 11XX XXXX XXXX */ + l=2; + } + s[l] = 0; + return l; +} +int +utf32towchar (ws, c) + wchar_t *ws; + unsigned long c; +{ + int l=0; + if ( sizeof (wchar_t) == 4) + { + ws[0]=c; + l=1; + } + else if ( sizeof (wchar_t) == 2) + { + l=utf32toutf16(ws, c); + } + ws[l] = 0; + return l; +} /* convert a single unicode-32 character into a multibyte string and put the result in S, which must be large enough (at least MB_LEN_MAX bytes) */ int -u32cconv (c, s) - unsigned long c; +utf32tomb (s, c) char *s; + unsigned long c; { - wchar_t wc; - int n; + size_t n=0; + wchar_t wstr[3]; #if HAVE_ICONV - const char *charset; - char obuf[25], *optr; - size_t obytesleft; - const char *iptr; - size_t sn; + char utf8buf[25], *optr; + size_t obytesleft, sn; + ICONV_CONST char *iptr; +#endif +#if HAVE_NL_LANGINFO + char *codeset; #endif - wc = c; + if ( n == 0 ) + { + /* + * Encode Method 1 + * UTF 0x00 -> 0x7f = ASCII Just copy + */ + if ( c <= 0x7f ) + { + s[0]=(char)c; + n=1; + } + } #if __STDC_ISO_10646__ - if (sizeof (wchar_t) == 4) + if ( n == 0 ) { - n = wctomb (s, wc); - return n; + /* + * Encode Method 2 + * Use wcstombs + */ + if( utf32towchar(wstr, c) ) + n = wcstombs (s, wstr, MB_LEN_MAX); + if(n == -1) + /* Error Encoding so let another method try */ + n=0; } #endif #if HAVE_NL_LANGINFO - codeset = nl_langinfo (CODESET); - if (STREQ (codeset, "UTF-8")) + if ( n == 0 ) { - n = u32toutf8 (wc, s); - return n; + /* + * Encode Method 3 + * Targets UTF-8 cool just encode. + */ + codeset = nl_langinfo (CODESET); + if (STREQ (codeset, "UTF-8")) + n = utf32toutf8 (s, c); } #endif #if HAVE_ICONV - /* this is mostly from coreutils-8.5/lib/unicodeio.c */ - if (u32init == 0) + if ( n == 0 ) { + /* + * Encode Method 4 + * Lets try iconv. + */ # if HAVE_LOCALE_CHARSET charset = locale_charset (); /* XXX - fix later */ # else charset = stub_charset (); # endif - if (STREQ (charset, "UTF-8")) - utf8locale = 1; - else + /* this is mostly from coreutils-8.5/lib/unicodeio.c */ + if( STREQ (charset, "UTF-8")) + n = utf32toutf8 (s, c); + else { localconv = iconv_open (charset, "UTF-8"); - if (localconv == (iconv_t)-1) - localconv = iconv_open (charset, "ASCII"); + if (localconv != (iconv_t)-1) + { + sn = utf32toutf8 (utf8buf, c); + + optr = s; + obytesleft = MB_LEN_MAX; + iptr = utf8buf; + + iconv (localconv, NULL, NULL, NULL, NULL); /* Reset iconv internal state */ + if (iconv (localconv, &iptr, &sn, &optr, &obytesleft) == (size_t)-1) + n=0; + else + n=(optr - s); + } } - u32init = 1; } - - if (utf8locale) - { - n = u32toutf8 (wc, s); - return n; - } - - if (localconv == (iconv_t)-1) - { - n = u32tochar (wc, s); - return n; - } - - n = u32toutf8 (wc, s); - - optr = obuf; - obytesleft = sizeof (obuf); - iptr = s; - sn = n; - - iconv (localconv, NULL, NULL, NULL, NULL); - - if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1) - return n; /* You get utf-8 if iconv fails */ - - *optr = '\0'; - - /* number of chars to be copied is optr - obuf if we want to do bounds - checking */ - strcpy (s, obuf); - return (optr - obuf); #endif - n = u32tochar (wc, s); /* fallback */ - return n; + if ( n == 0 ) + { + /* + * Error Encoding + */ +#if MB_LEN_MAX > 13 + n=sprintf(s, "", c); /* s buffer only 24 characters long */ +#endif + builtin_warning (_("U+%08lx unsupported in destination charset \"%s\" "), c, charset); + } + s[n]=0; + return n; } #endif /* HANDLE_MULTIBYTE */