From 2cf5d730690dad600f8b6d74d0b5fde522804e43 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Sun, 22 Jul 2018 09:50:20 -0700
Subject: [PATCH] df: avoid multibyte character corruption on macOS

This improves on the earlier fix for the problem reported by
Chih-Hsuan Yen (Bug#32236), by also looking for other control
characters and for encoding errors.
* src/df.c: Include wchar.h and wctype.h instead of c-ctype.h.
(hide_problematic_chars): Process the string as multibyte.
Use iswcntrl, not c_iscntrl.
---
 src/df.c | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/df.c b/src/df.c
index c851fcc..d27ba02 100644
--- a/src/df.c
+++ b/src/df.c
@@ -23,7 +23,8 @@
 #include <sys/types.h>
 #include <getopt.h>
 #include <assert.h>
-#include <c-ctype.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "system.h"
 #include "canonicalize.h"
@@ -272,21 +273,41 @@ static struct option const long_options[] =
   {NULL, 0, NULL, 0}
 };
 
-/* Replace problematic chars with '?'.
-   Since only control characters are currently considered,
-   this should work in all encodings.  */
+/* Replace problematic chars with '?'.  */
 
-static char*
+static void
 hide_problematic_chars (char *cell)
 {
-  char *p = cell;
-  while (*p)
+  char *srcend = cell + strlen (cell);
+  char *dst = cell;
+  mbstate_t mbstate = { 0, };
+  size_t n;
+
+  for (char *src = cell; src != srcend; src += n)
     {
-      if (c_iscntrl (to_uchar (*p)))
-        *p = '?';
-      p++;
+      wchar_t wc;
+      size_t srcbytes = srcend - src;
+      n = mbrtowc (&wc, src, srcbytes, &mbstate);
+      bool ok = 0 < n && n <= srcbytes;
+
+      if (ok)
+        ok = !iswcntrl (wc);
+      else
+        n = 1;
+
+      if (ok)
+        {
+          memmove (dst, src, n);
+          dst += n;
+        }
+      else
+        {
+          *dst++ = '?';
+          memset (&mbstate, 0, sizeof mbstate);
+        }
     }
-  return cell;
+
+  *dst = '\0';
 }
 
 /* Dynamically allocate a row of pointers in TABLE, which
-- 
2.7.4