[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 2/3] dfa: new option DFA_STRAY_BACKSLASH_WARN
From: Paul Eggert
Subject: [PATCH 2/3] dfa: new option DFA_STRAY_BACKSLASH_WARN
Date: Mon, 23 May 2022 12:19:11 -0700
This is for grep, which wants to warn about stray backslashes that
lead to unspecified behavior. For example, "grep -oi '\a'"
surprisingly is not equivalent to "grep -oi 'a'", so the stray
backslash should be warned about.
* lib/dfa.c: Include wctype.h, for iswprint and iswspace.
(lex): Add support for DFA_STRAY_BACKSLASH_WARN.
* lib/dfa.h (DFA_STRAY_BACKSLASH_WARN): New constant.
---
ChangeLog | 9 ++++
lib/dfa.c | 120 ++++++++++++++++++++++++++++++++++++------------------
lib/dfa.h | 3 ++
3 files changed, 93 insertions(+), 39 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 407baca335..0c5e799521 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
2022-05-23 Paul Eggert <eggert@cs.ucla.edu>
+ dfa: new option DFA_STRAY_BACKSLASH_WARN
+ This is for grep, which wants to warn about stray backslashes that
+ lead to unspecified behavior. For example, "grep -oi '\a'"
+ surprisingly is not equivalent to "grep -oi 'a'", so the stray
+ backslash should be warned about.
+ * lib/dfa.c: Include wctype.h, for iswprint and iswspace.
+ (lex): Add support for DFA_STRAY_BACKSLASH_WARN.
+ * lib/dfa.h (DFA_STRAY_BACKSLASH_WARN): New constant.
+
dfa: new option DFA_CONFUSING_BRACKETS_ERROR
This is for grep, which wants [:alpha:] to be an error
at the top level.
diff --git a/lib/dfa.c b/lib/dfa.c
index ba21639521..4833a20d72 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -59,6 +59,7 @@ c_isdigit (char c)
#define _(str) gettext (str)
#include <wchar.h>
+#include <wctype.h>
#include "xalloc.h"
#include "localeinfo.h"
@@ -1192,8 +1193,7 @@ lex (struct dfa *dfa)
we set the backslash flag and go through the loop again.
On the plus side, this avoids having a duplicate of the
main switch inside the backslash case. On the minus side,
- it means that just about every case begins with
- "if (backslash) ...". */
+ it means that just about every case tests the backslash flag. */
for (int i = 0; i < 2; ++i)
{
if (! dfa->lex.left)
@@ -1248,52 +1248,67 @@ lex (struct dfa *dfa)
case '7':
case '8':
case '9':
- if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
- {
- dfa->lex.laststart = false;
- return dfa->lex.lasttok = BACKREF;
- }
- goto normal_char;
+ if (!backslash)
+ goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
+ goto stray_backslash;
+
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = BACKREF;
case '`':
- if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- {
- /* FIXME: should be beginning of string */
- return dfa->lex.lasttok = BEGLINE;
- }
- goto normal_char;
+ if (!backslash)
+ goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
+
+ /* FIXME: should be beginning of string */
+ return dfa->lex.lasttok = BEGLINE;
case '\'':
- if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- {
- /* FIXME: should be end of string */
- return dfa->lex.lasttok = ENDLINE;
- }
- goto normal_char;
+ if (!backslash)
+ goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
+
+ /* FIXME: should be end of string */
+ return dfa->lex.lasttok = ENDLINE;
case '<':
- if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lex.lasttok = BEGWORD;
- goto normal_char;
+ if (!backslash)
+ goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
+
+ return dfa->lex.lasttok = BEGWORD;
case '>':
- if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lex.lasttok = ENDWORD;
- goto normal_char;
+ if (!backslash)
+ goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
+
+ return dfa->lex.lasttok = ENDWORD;
case 'b':
- if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lex.lasttok = LIMWORD;
- goto normal_char;
+ if (!backslash)
+ goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
+
+ return dfa->lex.lasttok = LIMWORD;
case 'B':
- if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lex.lasttok = NOTLIMWORD;
- goto normal_char;
+ if (!backslash)
+ goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
+
+ return dfa->lex.lasttok = NOTLIMWORD;
case '?':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
- goto normal_char;
+ goto default_case;
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
@@ -1311,7 +1326,7 @@ lex (struct dfa *dfa)
case '+':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
- goto normal_char;
+ goto default_case;
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
@@ -1321,7 +1336,7 @@ lex (struct dfa *dfa)
case '{':
if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
- goto normal_char;
+ goto default_case;
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
@@ -1379,7 +1394,7 @@ lex (struct dfa *dfa)
case '|':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
- goto normal_char;
+ goto default_case;
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
goto normal_char;
dfa->lex.laststart = true;
@@ -1387,7 +1402,9 @@ lex (struct dfa *dfa)
case '\n':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
- || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
+ || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
+ goto default_case;
+ if (backslash)
goto normal_char;
dfa->lex.laststart = true;
return dfa->lex.lasttok = OR;
@@ -1433,8 +1450,11 @@ lex (struct dfa *dfa)
case 's':
case 'S':
- if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ if (!backslash)
goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
+
if (!dfa->localeinfo.multibyte)
{
charclass ccl;
@@ -1466,8 +1486,10 @@ lex (struct dfa *dfa)
case 'w':
case 'W':
- if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ if (!backslash)
goto normal_char;
+ if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
+ goto stray_backslash;
if (!dfa->localeinfo.multibyte)
{
@@ -1505,6 +1527,26 @@ lex (struct dfa *dfa)
return dfa->lex.lasttok = parse_bracket_exp (dfa);
default:
+ default_case:
+ if (!backslash)
+ goto normal_char;
+ stray_backslash:
+ if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
+ {
+ char const *msg;
+ char msgbuf[100];
+ if (!iswprint (dfa->lex.wctok))
+ msg = _("stray \\ before unprintable character");
+ else if (iswspace (dfa->lex.wctok))
+ msg = _("stray \\ before white space");
+ else
+ {
+ int n = snprintf (msgbuf, sizeof msgbuf,
+ _("stray \\ before %lc"), dfa->lex.wctok);
+ msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
+ }
+ dfawarn (msg);
+ }
normal_char:
dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
diff --git a/lib/dfa.h b/lib/dfa.h
index 327b9c7cdf..8674929e90 100644
--- a/lib/dfa.h
+++ b/lib/dfa.h
@@ -78,6 +78,9 @@ enum
/* Treat [:alpha:] etc. as an error at the top level, instead of
merely a warning. */
DFA_CONFUSING_BRACKETS_ERROR = 1 << 2,
+
+ /* Warn about stray backslashes before ordinary characters. */
+ DFA_STRAY_BACKSLASH_WARN = 1 << 3,
};
/* Initialize or reinitialize a DFA. The arguments are:
--
2.36.1