bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 07/10] regex: fix longstanding backref match bug


From: Paul Eggert
Subject: [PATCH 07/10] regex: fix longstanding backref match bug
Date: Fri, 5 Feb 2021 17:25:59 -0800

This fixes a longstanding glibc bug concerning backreferences
<https://sourceware.org/11053> (2009-12-04).
* lib/regexec.c (proceed_next_node, push_fail_stack)
(pop_fail_stack): Push and pop the previous registers
as well as the current ones.  All callers changed.
(set_regs): Also pop if CUR_NODE has already been checked,
so that it does not get added as a duplicate set entry.
(update_regs): Fix comment location.
* tests/test-regex.c (tests): New constant.
(bug_regex11): New test function.
(main): Bump alarm value.  Call new test function.
---
 ChangeLog          |  13 ++++++
 lib/regexec.c      |  26 +++++++----
 tests/test-regex.c | 113 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 141 insertions(+), 11 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 74304474b..bd7d1fa16 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
 2021-02-05  Paul Eggert  <eggert@cs.ucla.edu>
 
+       regex: fix longstanding backref match bug
+       This fixes a longstanding glibc bug concerning backreferences
+       <https://sourceware.org/11053> (2009-12-04).
+       * lib/regexec.c (proceed_next_node, push_fail_stack)
+       (pop_fail_stack): Push and pop the previous registers
+       as well as the current ones.  All callers changed.
+       (set_regs): Also pop if CUR_NODE has already been checked,
+       so that it does not get added as a duplicate set entry.
+       (update_regs): Fix comment location.
+       * tests/test-regex.c (tests): New constant.
+       (bug_regex11): New test function.
+       (main): Bump alarm value.  Call new test function.
+
        regex: avoid duplicate in espilon closure
        * lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon
        closure first rather than last.  Otherwise, the epsilon closure
diff --git a/lib/regexec.c b/lib/regexec.c
index fdd2e373e..424bc8d15 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -59,7 +59,7 @@ static void update_regs (const re_dfa_t *dfa, regmatch_t 
*pmatch,
                         Idx cur_idx, Idx nmatch);
 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
                                      Idx str_idx, Idx dest_node, Idx nregs,
-                                     regmatch_t *regs,
+                                     regmatch_t *regs, regmatch_t *prevregs,
                                      re_node_set *eps_via_nodes);
 static reg_errcode_t set_regs (const regex_t *preg,
                               const re_match_context_t *mctx,
@@ -1211,6 +1211,7 @@ check_halt_state_context (const re_match_context_t *mctx,
 
 static Idx
 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
+                  regmatch_t *prevregs,
                   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
                   struct re_fail_stack_t *fs)
 {
@@ -1243,7 +1244,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx 
nregs, regmatch_t *regs,
              /* Otherwise, push the second epsilon-transition on the fail 
stack.  */
              else if (fs != NULL
                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
-                                          eps_via_nodes))
+                                          prevregs, eps_via_nodes))
                return -2;
 
              /* We know we are going to exit.  */
@@ -1316,7 +1317,8 @@ proceed_next_node (const re_match_context_t *mctx, Idx 
nregs, regmatch_t *regs,
 static reg_errcode_t
 __attribute_warn_unused_result__
 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
-                Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
+                Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
+                re_node_set *eps_via_nodes)
 {
   reg_errcode_t err;
   Idx num = fs->num++;
@@ -1332,23 +1334,26 @@ push_fail_stack (struct re_fail_stack_t *fs, Idx 
str_idx, Idx dest_node,
     }
   fs->stack[num].idx = str_idx;
   fs->stack[num].node = dest_node;
-  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
+  fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
   if (fs->stack[num].regs == NULL)
     return REG_ESPACE;
   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
+  memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
   return err;
 }
 
 static Idx
 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
-               regmatch_t *regs, re_node_set *eps_via_nodes)
+               regmatch_t *regs, regmatch_t *prevregs,
+               re_node_set *eps_via_nodes)
 {
   if (fs == NULL || fs->num == 0)
     return -1;
   Idx num = --fs->num;
   *pidx = fs->stack[num].idx;
   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
+  memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
   re_node_set_free (eps_via_nodes);
   re_free (fs->stack[num].regs);
   *eps_via_nodes = fs->stack[num].eps_via_nodes;
@@ -1408,7 +1413,8 @@ set_regs (const regex_t *preg, const re_match_context_t 
*mctx, size_t nmatch,
     {
       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
 
-      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
+      if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
+         || re_node_set_contains (&eps_via_nodes, cur_node))
        {
          Idx reg_idx;
          cur_node = -1;
@@ -1418,7 +1424,7 @@ set_regs (const regex_t *preg, const re_match_context_t 
*mctx, size_t nmatch,
                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
                  {
                    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
-                                              &eps_via_nodes);
+                                              prev_idx_match, &eps_via_nodes);
                    break;
                  }
            }
@@ -1431,7 +1437,8 @@ set_regs (const regex_t *preg, const re_match_context_t 
*mctx, size_t nmatch,
        }
 
       /* Proceed to next node.  */
-      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
+      cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
+                                   &idx, cur_node,
                                    &eps_via_nodes, fs);
 
       if (__glibc_unlikely (cur_node < 0))
@@ -1443,7 +1450,8 @@ set_regs (const regex_t *preg, const re_match_context_t 
*mctx, size_t nmatch,
              free_fail_stack_return (fs);
              return REG_ESPACE;
            }
-         cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, &eps_via_nodes);
+         cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
+                                    prev_idx_match, &eps_via_nodes);
          if (cur_node < 0)
            {
              re_node_set_free (&eps_via_nodes);
diff --git a/tests/test-regex.c b/tests/test-regex.c
index 58f8200f2..a14619805 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -55,6 +55,111 @@ really_utf8 (void)
   return strcmp (locale_charset (), "UTF-8") == 0;
 }
 
+/* Tests supposed to match; copied from glibc posix/bug-regex11.c.  */
+static struct
+{
+  const char *pattern;
+  const char *string;
+  int flags, nmatch;
+  regmatch_t rm[5];
+} const tests[] = {
+  /* Test for newline handling in regex.  */
+  { "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } },
+  /* Other tests.  */
+  { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } },
+  { ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
+    { { 0, 21 }, { 15, 16 }, { 16, 18 } } },
+  { ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
+    { { 0, 21 }, { 8, 9 }, { 9, 10 } } },
+  { "^\\(a*\\)\\1\\{9\\}\\(a\\{0,9\\}\\)\\([0-9]*;.*[^a]\\2\\([0-9]\\)\\)",
+    "a1;;0a1aa2aaa3aaaa4aaaaa5aaaaaa6aaaaaaa7aaaaaaaa8aaaaaaaaa9aa2aa1a0", 0,
+    5, { { 0, 67 }, { 0, 0 }, { 0, 1 }, { 1, 67 }, { 66, 67 } } },
+  /* Test for BRE expression anchoring.  POSIX says just that this may match;
+     in glibc regex it always matched, so avoid changing it.  */
+  { "\\(^\\|foo\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
+  { "\\(foo\\|^\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
+  /* In ERE this must be treated as an anchor.  */
+  { "(^|foo)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
+  { "(foo|^)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
+  /* Here ^ cannot be treated as an anchor according to POSIX.  */
+  { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+  { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+  /* More tests on backreferences.  */
+  { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+  { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } },
+  { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+  { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 
} } },
+  { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 
} } },
+  { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } },
+  { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
+  { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } },
+  { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
+  { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
+  { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } },
+  { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } 
},
+  { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 } 
} },
+  { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } } 
},
+  { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } 
},
+  { "^([^,]*),\\1,\\1$", "a,a,a", REG_EXTENDED, 2, { { 0, 5 }, { 0, 1 } } },
+  { "^([^,]*),\\1,\\1$", "ab,ab,ab", REG_EXTENDED, 2, { { 0, 8 }, { 0, 2 } } },
+  { "^([^,]*),\\1,\\1,\\1$", "abc,abc,abc,abc", REG_EXTENDED, 2,
+    { { 0, 15 }, { 0, 3 } } },
+  { "^(.?)(.?)(.?)(.?)(.?).?\\5\\4\\3\\2\\1$",
+    "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+  { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+    "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+  { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+    "abcdedcba", REG_EXTENDED, 1, { { 0, 9 } } },
+  /* XXX Not used since they fail so far.  */
+  { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+    "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
+  { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
+    "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+  { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
+    "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
+};
+
+static void
+bug_regex11 (void)
+{
+  regex_t re;
+  regmatch_t rm[5];
+  size_t i;
+  int n;
+
+  for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
+    {
+      n = regcomp (&re, tests[i].pattern, tests[i].flags);
+      if (n != 0)
+       {
+         char buf[500];
+         regerror (n, &re, buf, sizeof (buf));
+         report_error ("%s: regcomp %zd failed: %s", tests[i].pattern, i, buf);
+         continue;
+       }
+
+      if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0))
+       {
+         report_error ("%s: regexec %zd failed", tests[i].pattern, i);
+         regfree (&re);
+         continue;
+       }
+
+      for (n = 0; n < tests[i].nmatch; ++n)
+       if (rm[n].rm_so != tests[i].rm[n].rm_so
+              || rm[n].rm_eo != tests[i].rm[n].rm_eo)
+         {
+           if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1)
+             break;
+           report_error ("%s: regexec %zd match failure rm[%d] %d..%d",
+                          tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo);
+           break;
+         }
+
+      regfree (&re);
+    }
+}
+
 int
 main (void)
 {
@@ -65,11 +170,15 @@ main (void)
   struct re_registers regs;
 
 #if HAVE_DECL_ALARM
-  /* Some builds of glibc go into an infinite loop on this test.  */
-  int alarm_value = 2;
+  /* In case a bug causes glibc to go into an infinite loop.
+     The tests should take less than 10 s on a reasonably modern CPU.  */
+  int alarm_value = 1000;
   signal (SIGALRM, SIG_DFL);
   alarm (alarm_value);
 #endif
+
+  bug_regex11 ();
+
   if (setlocale (LC_ALL, "en_US.UTF-8"))
     {
       {
-- 
2.27.0




reply via email to

[Prev in Thread] Current Thread [Next in Thread]