bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 1/4] Fix width computation


From: Robbie Harwood
Subject: Re: [PATCH v3 1/4] Fix width computation
Date: Wed, 12 Jan 2022 14:48:11 -0500

Bruno Haible <bruno@clisp.org> writes:

> Robbie Harwood wrote:
>> From: Vladimir 'phcoder' Serbinenko <phcoder@gmail.com>
>> 
>> [rharwood@redhat.com: merge with later fix from pjones@redhat.com]
>> Signed-off-by: Robbie Harwood <rharwood@redhat.com>
>> ---
>>  lib/argp-fmtstream.c | 80 +++++++++++++++++++++++++++++++++++++-------
>>  lib/argp-help.c      |  3 +-
>>  lib/mbswidth.c       | 15 +++++++++
>>  lib/mbswidth.h       |  4 +++
>>  4 files changed, 88 insertions(+), 14 deletions(-)
>
> This patch will need several more iterations, as it mixes two good
> ideas, with - so far - imperfect implementations:
>
>   * The need to use wcwidth in argp-fmtstream arises because the
>     help strings are internationalized. But for internationalized strings,
>     line breaking by looking for spaces is just wrong. (It works for
>     Russian and Greek but not for Chinese.) A much better algorithm
>     (that works for most languages, except Thai) is found in
>     GNU libunistring 1.0. But the code in glibc cannot use libunistring;
>     so there likely will need to be some '#ifdef _LIBC'.
>
>   * The idea to parse escape sequences in a special way, before invoking
>     wcwidth, it nice. But's it's far from complete. IMO one needs to look
>     at GNU teseq, the ISO 2022 standard, and other de-facto standards
>     when implementing that. And it should then be implemented in wcswidth,
>     mbswidth, etc. uniformly.

I've put together at using libunistring (attached).  Unfortunately it
doesn't seem to find ulc_width_linebreaks() at link time, and I'm not
familiar enough with gnulib to understand why.  Feedback on that (or
other parts of it) appreciated.

Be well,
--Robbie

Attachment: signature.asc
Description: PGP signature

>From 12c09fc6632e295101ee6957c952bbf1b5c0165a Mon Sep 17 00:00:00 2001
From: Robbie Harwood <rharwood@redhat.com>
Date: Mon, 10 Jan 2022 17:17:39 -0500
Subject: [PATCH] argp-fmtstream: use ulc_width_linebreaks()

ulc_width_linebreaks() can't be used in _LIBC, so fall back to a
best-effort solution there that won't work with Chinese.

Signed-off-by: Robbie Harwood <rharwood@redhat.com>
---
 lib/argp-fmtstream.c | 361 +++++++++++++++++++------------------------
 modules/argp         |   1 +
 2 files changed, 161 insertions(+), 201 deletions(-)

diff --git a/lib/argp-fmtstream.c b/lib/argp-fmtstream.c
index 78c29e38c..12d26effb 100644
--- a/lib/argp-fmtstream.c
+++ b/lib/argp-fmtstream.c
@@ -28,9 +28,11 @@
 #include <errno.h>
 #include <stdarg.h>
 #include <ctype.h>
+#include <wchar.h>
 
 #include "argp-fmtstream.h"
 #include "argp-namefrob.h"
+#include "mbswidth.h"
 
 #ifndef ARGP_FMTSTREAM_USE_LINEWRAP
 
@@ -42,6 +44,8 @@
 # include <wchar.h>
 # include <libio/libioP.h>
 # define __vsnprintf(s, l, f, a) _IO_vsnprintf (s, l, f, a)
+#else
+# include <unilbrk.h>
 #endif
 
 #define INIT_BUF_SIZE 200
@@ -115,233 +119,188 @@ weak_alias (__argp_fmtstream_free, argp_fmtstream_free)
 #endif
 #endif
 
-/* Process FS's buffer so that line wrapping is done from POINT_OFFS to the
-   end of its buffer.  This code is mostly from glibc stdio/linewrap.c.  */
-void
-__argp_fmtstream_update (argp_fmtstream_t fs)
+/* Insert suffix and left margin at POINT_OFFS, flushing as needed.  */
+static void
+line_at_point (argp_fmtstream_t fs, const char *suffix)
 {
-  char *buf, *nl;
-  size_t len;
+  size_t i, space_needed = fs->wmargin;
+  char *nl, *queue = fs->buf + fs->point_offs;
 
-  /* Scan the buffer for newlines.  */
-  buf = fs->buf + fs->point_offs;
-  while (buf < fs->p)
+  if (suffix)
+    space_needed += strlen(suffix);
+
+  while (fs->p + space_needed > fs->end)
     {
-      size_t r;
-
-      if (fs->point_col == 0 && fs->lmargin != 0)
+      /* Output the first line so we can use the space.  */
+      nl = memchr (fs->buf, '\n', fs->point_offs);
+      if (nl == NULL)
         {
-          /* We are starting a new line.  Print spaces to the left margin.  */
-          const size_t pad = fs->lmargin;
-          if (fs->p + pad < fs->end)
-            {
-              /* We can fit in them in the buffer by moving the
-                 buffer text up and filling in the beginning.  */
-              memmove (buf + pad, buf, fs->p - buf);
-              fs->p += pad; /* Compensate for bigger buffer. */
-              memset (buf, ' ', pad); /* Fill in the spaces.  */
-              buf += pad; /* Don't bother searching them.  */
-            }
-          else
-            {
-              /* No buffer space for spaces.  Must flush.  */
-              size_t i;
-              for (i = 0; i < pad; i++)
-                {
+          /* Line longer than buffer - shouldn't happen.  Truncate.  */
+          nl = queue - 1;
+          *nl = '\n';
+        }
+
 #ifdef _LIBC
-                  if (_IO_fwide (fs->stream, 0) > 0)
-                    putwc_unlocked (L' ', fs->stream);
-                  else
+      __fxprintf (fs->stream, "%.*s\n",
+                  (int) (nl - fs->buf), fs->buf);
+#else
+      if (nl > fs->buf)
+        fwrite_unlocked (fs->buf, 1, nl - fs->buf, fs->stream);
+      putc_unlocked ('\n', fs->stream);
 #endif
-                    putc_unlocked (' ', fs->stream);
-                }
-            }
-          fs->point_col = pad;
-        }
 
-      len = fs->p - buf;
-      nl = memchr (buf, '\n', len);
+      memmove (fs->buf, nl + 1, nl + 1 - fs->buf);
+      fs->p -= nl + 1 - fs->buf;
+      fs->point_offs -= nl + 1 - fs->buf;
+    }
 
-      if (fs->point_col < 0)
-        fs->point_col = 0;
+  memmove (queue + space_needed, queue, fs->p - queue);
+  if (suffix)
+    {
+      memcpy (queue, suffix, strlen (suffix));
+      fs->point_offs += strlen (suffix);
+    }
+  for (i = 0; i < fs->lmargin && fs->point_col != -1; i++)
+    fs->buf[fs->point_offs++] = ' ';
+  fs->point_col = fs->lmargin;
+}
 
-      if (!nl)
+#ifdef _LIBC
+# define UC_BREAK_UNDEFINED 0
+# define UC_BREAK_POSSIBLE 1
+# define UC_BREAK_HYPHENATION 2
+# define UC_BREAK_MANDATORY 3
+static int
+uwl_shim(const char *s, size_t n, int width, int start_column,
+         const char *encoding, char *p)
+{
+  size_t i = 0, cur_col = start_column, last_option = 0, c_len;
+  mbstate_t ps;
+  wchar_t c;
+
+  memset(&ps, 0, sizeof (ps));
+  memset(p, UC_BREAK_UNDEFINED, n);
+
+  while (i < n)
+    {
+      c_len = mbrtowc (&c, s + i, n, &ps);
+      if (c_len == 0 || c_len == -2)
         {
-          /* The buffer ends in a partial line.  */
-
-          if (fs->point_col + len < fs->rmargin)
-            {
-              /* The remaining buffer text is a partial line and fits
-                 within the maximum line width.  Advance point for the
-                 characters to be written and stop scanning.  */
-              fs->point_col += len;
-              break;
-            }
-          else
-            /* Set the end-of-line pointer for the code below to
-               the end of the buffer.  */
-            nl = fs->p;
+          break;
         }
-      else if (fs->point_col + (nl - buf) < (ssize_t) fs->rmargin)
+      else if (c_len == -1)
         {
-          /* The buffer contains a full line that fits within the maximum
-             line width.  Reset point and scan the next line.  */
-          fs->point_col = 0;
-          buf = nl + 1;
+          /* Malformed.  Walk forward until things make sense again.  */
+          i++;
+          cur_col++;
           continue;
         }
 
-      /* This line is too long.  */
-      r = fs->rmargin - 1;
-
-      if (fs->wmargin < 0)
+      if (c == L'\n')
         {
-          /* Truncate the line by overwriting the excess with the
-             newline and anything after it in the buffer.  */
-          if (nl < fs->p)
+          p[i] = UC_BREAK_MANDATORY;
+          last_option = 0;
+          cur_col = 0;
+        }
+      else if (c == L' ' || c == '\t')
+        {
+          /* Best effort - won't help with Chinese or Thai.  */
+          last_option = i;
+        }
+      else if (cur_col >= width && last_option != 0)
+        {
+          p[last_option] = UC_BREAK_POSSIBLE;
+          last_option = 0;
+          cur_col = 0;
+        }
+
+      i += c_len;
+      cur_col += wcwidth(c);
+    }
+
+  return n - last_break;
+}
+#define ulc_width_linebreaks(s, n, width, start_column, at_end_columns, \
+                             o, encoding, p)                            \
+  uwl_shim(s, n, width, start_column, encoding, p)
+#endif
+
+/* Process FS's buffer so that line wrapping is done from POINT_OFFS to the
+   end of its buffer.  */
+void
+__argp_fmtstream_update (argp_fmtstream_t fs)
+{
+  char *lns, *nl, *c, *queue = fs->buf + fs->point_offs;
+  int col;
+  size_t i, proc_len = fs->p - queue;
+
+  if (queue >= fs->p)
+    return;
+
+  if (fs->wmargin == -1)
+    {
+      while (fs->buf + fs->point_offs < fs->p)
+        {
+          nl = memchr (fs->buf + fs->point_offs, '\n',
+                       fs->rmargin - fs->point_col);
+          if (nl)
             {
-              memmove (buf + (r - fs->point_col), nl, fs->p - nl);
-              fs->p -= buf + (r - fs->point_col) - nl;
-              /* Reset point for the next line and start scanning it.  */
-              fs->point_col = 0;
-              buf += r + 1; /* Skip full line plus \n. */
+              fs->point_offs = nl - queue;
+              line_at_point (fs, NULL);
+              continue;
             }
-          else
+
+          /* Truncate until end of line.  */
+          fs->point_offs += fs->rmargin - fs->point_col;
+          nl = memchr (fs->buf + fs->point_offs + fs->rmargin - fs->point_col,
+                       '\n', fs->p - fs->buf - fs->point_offs);
+          if (! nl)
             {
-              /* The buffer ends with a partial line that is beyond the
-                 maximum line width.  Advance point for the characters
-                 written, and discard those past the max from the buffer.  */
-              fs->point_col += len;
-              fs->p -= fs->point_col - r;
-              break;
+              /* This is the last line.  */
+              fs->buf[fs->point_offs++] = '\n';
+              fs->p = fs->buf + fs->point_offs;
+              line_at_point(fs, NULL);
+              return;
             }
+
+          fs->buf[fs->point_offs++] = '\n';
+          memmove (fs->buf + fs->point_offs, nl + 1,
+                   nl + 1 - fs->buf - fs->point_offs);
+          line_at_point(fs, NULL);
+        }
+      return;
+    }
+
+  lns = (char *) malloc (proc_len);
+  if (! lns)
+    return;
+  
+  col = ulc_width_linebreaks (queue, proc_len, fs->rmargin - fs->wmargin,
+                              fs->point_col == -1 ? 0 : fs->point_col, 0,
+                              NULL, locale_charset (), lns);
+  for (i = 0; i < proc_len; i++)
+    {
+      if (lns[i] == UC_BREAK_HYPHENATION)
+        {
+          line_at_point (fs, "-\n");
+        }
+      else if (lns[i] == UC_BREAK_POSSIBLE)
+        {
+          line_at_point (fs, "\n");
+        }
+      else if (lns[i] == UC_BREAK_MANDATORY)
+        {
+          fs->point_offs++;
+          line_at_point (fs, NULL);
         }
       else
         {
-          /* Do word wrap.  Go to the column just past the maximum line
-             width and scan back for the beginning of the word there.
-             Then insert a line break.  */
-
-          char *p, *nextline;
-          int i;
-
-          p = buf + (r + 1 - fs->point_col);
-          while (p >= buf && !isblank ((unsigned char) *p))
-            --p;
-          nextline = p + 1;     /* This will begin the next line.  */
-
-          if (nextline > buf)
-            {
-              /* Swallow separating blanks.  */
-              if (p >= buf)
-                do
-                  --p;
-                while (p >= buf && isblank ((unsigned char) *p));
-              nl = p + 1;       /* The newline will replace the first blank. */
-            }
-          else
-            {
-              /* A single word that is greater than the maximum line width.
-                 Oh well.  Put it on an overlong line by itself.  */
-              p = buf + (r + 1 - fs->point_col);
-              /* Find the end of the long word.  */
-              if (p < nl)
-                do
-                  ++p;
-                while (p < nl && !isblank ((unsigned char) *p));
-              if (p == nl)
-                {
-                  /* It already ends a line.  No fussing required.  */
-                  fs->point_col = 0;
-                  buf = nl + 1;
-                  continue;
-                }
-              /* We will move the newline to replace the first blank.  */
-              nl = p;
-              /* Swallow separating blanks.  */
-              do
-                ++p;
-              while (isblank ((unsigned char) *p));
-              /* The next line will start here.  */
-              nextline = p;
-            }
-
-          /* Note: There are a bunch of tests below for
-             NEXTLINE == BUF + LEN + 1; this case is where NL happens to fall
-             at the end of the buffer, and NEXTLINE is in fact empty (and so
-             we need not be careful to maintain its contents).  */
-
-          if ((nextline == buf + len + 1
-               ? fs->end - nl < fs->wmargin + 1
-               : nextline - (nl + 1) < fs->wmargin)
-              && fs->p > nextline)
-            {
-              /* The margin needs more blanks than we removed.  */
-              if (fs->end - fs->p > fs->wmargin + 1)
-                /* Make some space for them.  */
-                {
-                  size_t mv = fs->p - nextline;
-                  memmove (nl + 1 + fs->wmargin, nextline, mv);
-                  nextline = nl + 1 + fs->wmargin;
-                  len = nextline + mv - buf;
-                  *nl++ = '\n';
-                }
-              else
-                /* Output the first line so we can use the space.  */
-                {
-#ifdef _LIBC
-                  __fxprintf (fs->stream, "%.*s\n",
-                              (int) (nl - fs->buf), fs->buf);
-#else
-                  if (nl > fs->buf)
-                    fwrite_unlocked (fs->buf, 1, nl - fs->buf, fs->stream);
-                  putc_unlocked ('\n', fs->stream);
-#endif
-
-                  len += buf - fs->buf;
-                  nl = buf = fs->buf;
-                }
-            }
-          else
-            /* We can fit the newline and blanks in before
-               the next word.  */
-            *nl++ = '\n';
-
-          if (nextline - nl >= fs->wmargin
-              || (nextline == buf + len + 1 && fs->end - nextline >= 
fs->wmargin))
-            /* Add blanks up to the wrap margin column.  */
-            for (i = 0; i < fs->wmargin; ++i)
-              *nl++ = ' ';
-          else
-            for (i = 0; i < fs->wmargin; ++i)
-#ifdef _LIBC
-              if (_IO_fwide (fs->stream, 0) > 0)
-                putwc_unlocked (L' ', fs->stream);
-              else
-#endif
-                putc_unlocked (' ', fs->stream);
-
-          /* Copy the tail of the original buffer into the current buffer
-             position.  */
-          if (nl < nextline)
-            memmove (nl, nextline, buf + len - nextline);
-          len -= nextline - buf;
-
-          /* Continue the scan on the remaining lines in the buffer.  */
-          buf = nl;
-
-          /* Restore bufp to include all the remaining text.  */
-          fs->p = nl + len;
-
-          /* Reset the counter of what has been output this line.  If wmargin
-             is 0, we want to avoid the lmargin getting added, so we set
-             point_col to a magic value of -1 in that case.  */
-          fs->point_col = fs->wmargin ? fs->wmargin : -1;
+          fs->point_offs++;
         }
     }
+  fs->point_col = col;
 
-  /* Remember that we've scanned as far as the end of the buffer.  */
-  fs->point_offs = fs->p - fs->buf;
+  free (lns);
 }
 
 /* Ensure that FS has space for AMOUNT more bytes in its buffer, either by
diff --git a/modules/argp b/modules/argp
index 85eefe2b1..1a958b1f1 100644
--- a/modules/argp
+++ b/modules/argp
@@ -36,6 +36,7 @@ stdio
 strerror
 memchr
 memmove
+libunistring
 
 configure.ac:
 gl_ARGP
-- 
2.34.1


reply via email to

[Prev in Thread] Current Thread [Next in Thread]