bison-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

maint: diagnostics: beware of zero-width characters


From: Akim Demaille
Subject: maint: diagnostics: beware of zero-width characters
Date: Sun, 8 Mar 2020 08:02:06 +0100

commit f438962a6d46c0964649bdf6a88fd419ec9b62a9
Author: Akim Demaille <address@hidden>
Date:   Sat Mar 7 12:59:09 2020 +0100

    diagnostics: beware of zero-width characters
    
    Currently we rely on (visual) width of the characters to decide where
    to open and close the styling of the quoted lines.  This breaks when
    we deal with zero-width characters: we cannot just rely on (visual)
    columns, we need to know whether we are before, inside, or after the
    highlighted portion.
    
    * src/location.c (location_caret): col_end: no longer add 1, "regular"
    characters have a width of 1, only 0-width characters have 0-width.
    opened: replace with 'state', a three-valued enum.
    Don't reopen the style if we already did.
    * tests/diagnostics.at (Zero-width characters): New.

diff --git a/src/location.c b/src/location.c
index dbcd67ec..9f929c00 100644
--- a/src/location.c
+++ b/src/location.c
@@ -421,12 +421,14 @@ location_caret (location loc, const char *style, FILE 
*out)
       {
         /* The last column to highlight.  Only the first line of
            multiline locations are quoted, in which case the ending
-           column is the end of line.  Single point locations (with
-           equal boundaries) denote the character that they
-           follow.  */
-        int col_end
+           column is the end of line.
+
+           We used to work with byte offsets, and that was much
+           easier.  However, we went back to using (visual) columns to
+           support truncating of long lines.  */
+        const int col_end
           = loc.start.line == loc.end.line
-          ? loc.end.column + (loc.start.column == loc.end.column)
+          ? loc.end.column
           : caret_info.line_len;
         /* Quote the file (at most the first line in the case of
            multiline locations).  */
@@ -436,24 +438,28 @@ location_caret (location loc, const char *style, FILE 
*out)
              expected (maybe the file was changed since the scanner
              ran), we might reach the end before we actually saw the
              opening column.  */
-          bool opened = false;
+          enum { before, inside, after } state = before;
           while (!mb_iseof (c) && !mb_iseq (c, '\n'))
             {
-              if (caret_info.pos.column == loc.start.column)
+              // We might have already opened (and even closed!) the
+              // style and yet have the equality of the columns if we
+              // just saw zero-width characters.
+              if (state == before
+                  && caret_info.pos.column == loc.start.column)
                 {
                   begin_use_class (style, out);
-                  opened = true;
+                  state = inside;
                 }
               if (skip < caret_info.pos.column)
                 mb_putc (c, out);
               boundary_compute (&caret_info.pos, mb_ptr (c), mb_len (c));
               caret_getc (c);
-              if (opened
+              if (state == inside
                   && (caret_info.pos.column == col_end
                       || width < caret_info.pos.column - skip))
                 {
                   end_use_class (style, out);
-                  opened = false;
+                  state = after;
                 }
               if (width < caret_info.pos.column - skip)
                 {
@@ -461,11 +467,11 @@ location_caret (location loc, const char *style, FILE 
*out)
                   break;
                 }
             }
-          // The line is shorter than expected.
-          if (opened)
+          if (state == inside)
             {
+              // The line is shorter than expected.
               end_use_class (style, out);
-              opened = false;
+              state = after;
             }
           putc ('\n', out);
         }
diff --git a/src/location.h b/src/location.h
index ccb42e3c..2abe438b 100644
--- a/src/location.h
+++ b/src/location.h
@@ -42,16 +42,14 @@ typedef struct
 
   /* If positive, the column (starting at 1) just after the boundary.
      This is neither a byte count, nor a character count; it is a
-     column count.  If this is INT_MAX, the column number has
+     (visual) column count.  If this is INT_MAX, the column number has
      overflowed.
-
-     Meaningless and not displayed if nonpositive.
-  */
+     
+     Meaningless and not displayed if nonpositive.  */
   int column;
 
-  /* If nonnegative, the byte number (starting at 0) in the current line.
-     Never displayed, used when printing error messages with colors to
-     know where colors start and end.  */
+  /* If nonnegative, the byte number (starting at 0) in the current
+     line.  Not displayed (unless --trace=location).  */
   int byte;
 
 } boundary;
diff --git a/tests/diagnostics.at b/tests/diagnostics.at
index cbf56b77..1471934f 100644
--- a/tests/diagnostics.at
+++ b/tests/diagnostics.at
@@ -37,15 +37,15 @@ AT_BISON_OPTION_PUSHDEFS
 
 AT_DATA_GRAMMAR([[input.y]], [$2])
 
+AT_DATA([experr], [$4])
+
 # For some reason, literal ^M in the input are removed and don't end
 # in `input.y`.  So use the two-character ^M represent it, and let
 # Perl insert real CR characters.
-if grep '\^M' input.y >/dev/null; then
-  AT_PERL_REQUIRE([-pi -e 's{\^M}{\r}gx' input.y])
+if $EGREP ['\^M|\\[0-9][0-9][0-9]'] input.y experr >/dev/null; then
+  AT_PERL_REQUIRE([-pi -e 's{\^M}{\r}g;s{\\(\d{3}|.)}{$v = $[]1; $v =~ 
/\A\d+\z/ ? chr($v) : $v}ge' input.y experr])
 fi
 
-AT_DATA([experr], [$4])
-
 AT_CHECK([LC_ALL="$locale" $5 bison -fcaret --color=debug -Wall input.y], 
[$3], [], [experr])
 
 # When no style, same messages, but without style.
@@ -193,6 +193,24 @@ input.y:12.8-10:     previous declaration
 ]])
 
 
+## ----------------------- ##
+## Zero-width characters.  ##
+## ----------------------- ##
+
+# We used to open twice the styling for characters that have a
+# zero-width on display (e.g., \005).
+
+AT_TEST([[Zero-width characters]],
+[[%%
+exp: an\005error.
+]],
+[1],
+[[input.y:10.8: <error>error:</error> invalid character: '\\005'
+   10 | exp: an<error>\005</error>error.
+      |        <error>^</error>
+]])
+
+
 ## -------------------------------------- ##
 ## Tabulations and multibyte characters.  ##
 ## -------------------------------------- ##




reply via email to

[Prev in Thread] Current Thread [Next in Thread]