bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: regex-quote.c syntax support


From: Bruno Haible
Subject: Re: regex-quote.c syntax support
Date: Sun, 6 Mar 2011 14:41:19 +0100
User-agent: KMail/1.9.9

Hello Reuben,

> > Before we can decide on this, IMO some analysis is needed:
> >
> >  - What are the possible effects of reg_syntax_t on the string of
> >    characters to be escaped? I can see
> >      RE_BK_PLUS_QM                   ->    +?
> >      RE_INTERVALS, RE_NO_BK_BRACES   ->    {}
> >    What other relations are there?
> 
> RE_NO_BK_PARENS -> ()
> RE_NO_BK_VBAR -> |

Yup, thanks.

> RE_NO_BK_REFS -> [:digit:]

I don't know what you mean by that? '[' and ']' are already in the list of
characters to be escaped. So no need to look at RE_NO_BK_REFS, right?

> >  - What characters need to be escaped in PCRE syntax?
> 
> According to pcrepattern(3):
> 
> ^$.[|()?*+{

Thanks. I'll add ] and } for symmetry.

> >  - Do Emacs and PCRE view a regex as a sequence of bytes or as a sequence
> >    of multibyte characters in the locale encoding (given by LC_CTYPE)?
> 
> PCRE doesn't do locales; it treats strings as either bytes or, given a
> specific flag, UTF-8.

Weird! This means that the regex_quote task also needs to work on bytes
when PCRE syntax is requested.

> I don't really understand the question about Emacs: someone using
> regex-quote in their own programs is worried about Emacs syntax

There are two possible uses of regex_quote with EMACS syntax:
  - if your program wants to call re_compile_pattern,
  - if your program wants to pass such a regular expression to Emacs via
    command-line invocations or similar.
In the first case, the result should be in locale encoding. In the second
case, maybe not.

How about this proposed API?


2011-03-06  Bruno Haible  <address@hidden>

        regex-quote: New API.
        * lib/regex-quote.h: Include <stdbool.h>.
        (struct regex_quote_spec): New type.
        (regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
        New declarations.
        (regex_quote_length, regex_quote_copy, regex_quote): Take a
        'const struct regex_quote_spec *' argument.
        * lib/regex-quote.c (RE_*, PCRE_*): New macros.
        (pcre_special): New constant.
        (regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
        New functions.
        (regex_quote_length, regex_quote_copy, regex_quote): Take a
        'const struct regex_quote_spec *' argument.
        * modules/regex-quote (Depends-on): Add stdbool.
        * tests/test-regex-quote.c (check): Update for new API. Add test for
        anchored results.
        * NEWS: Mention the API change.
        Reported by Reuben Thomas and Eric Blake.

*** NEWS.orig   Sun Mar  6 14:37:53 2011
--- NEWS        Sun Mar  6 14:26:31 2011
***************
*** 12,17 ****
--- 12,21 ----
  
  Date        Modules         Changes
  
+ 2011-03-06  regex-quote     The last argument is no longer an 'int cflags'
+                             but instead a pointer to a previously constructed
+                             'struct regex_quote_spec'.
+ 
  2011-02-25  dirname         These modules no longer put #defines for the
              dirname-lgpl    following symbols into <config.h>: ISSLASH,
              backupfile      FILE_SYSTEM_ACCEPTS_DRIVE_LETTER_PREFIX,
*** lib/regex-quote.h.orig      Sun Mar  6 14:37:53 2011
--- lib/regex-quote.h   Sun Mar  6 14:26:31 2011
***************
*** 15,41 ****
     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  
  #include <stddef.h>
  
! /* regex_quote converts a literal string to a regular expression that will
!    look for this literal string.
!    cflags can be 0 or REG_EXTENDED.
     If it is 0, the result is a Basic Regular Expression (BRE)
     <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03>.
     If it is REG_EXTENDED, the result is an Extended Regular Expression (ERE)
     <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04>.
!    The result is not anchored; if you want it to match only complete lines,
!    you need to add "^" at the beginning of the result and "$" at the end of the
!    result.
!  */
  
  /* Returns the number of bytes needed for the quoted string.  */
! extern size_t regex_quote_length (const char *string, int cflags);
  
  /* Copies the quoted string to p and returns the incremented p.
!    There must be room for regex_quote_length (string, cflags) + 1 bytes at p.
!  */
! extern char * regex_quote_copy (char *p, const char *string, int cflags);
  
  /* Returns the freshly allocated quoted string.  */
! extern char * regex_quote (const char *string, int cflags);
--- 15,87 ----
     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  
+ #ifndef _REGEX_QUOTE_H
+ #define _REGEX_QUOTE_H
+ 
  #include <stddef.h>
+ #include <stdbool.h>
+ 
+ 
+ /* Specifies a quotation task for converting a fixed string to a regular
+    expression pattern.  */
+ struct regex_quote_spec
+ {
+   /* True if the regular expression pattern consists of multibyte characters,
+      false if it consists of single bytes or UTF-8 characters.  */
+   unsigned int /*bool*/ multibyte : 1;
+   /* True if the regular expression pattern shall match only entire lines.  */
+   unsigned int /*bool*/ anchored : 1;
+   /* Set of characters that need to be escaped (all ASCII), as a
+      NUL-terminated string of at most 30 characters.  */
+   char special[30 + 1];
+ };
  
! 
! /* Creates a quotation task that produces a POSIX regular expression, that is,
!    a pattern that can be compiled with regcomp().
!    CFLAGS can be 0 or REG_EXTENDED.
     If it is 0, the result is a Basic Regular Expression (BRE)
     <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03>.
     If it is REG_EXTENDED, the result is an Extended Regular Expression (ERE)
     <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04>.
!    If ANCHORED is false, the regular expression will match substrings of lines.
!    If ANCHORED is true, it will match only complete lines.  */
! extern struct regex_quote_spec
!        regex_quote_spec_posix (int cflags, bool anchored);
! 
! /* Creates a quotation task that produces a regular expression that can be
!    compiled with the GNU API function re_compile_pattern().
!    SYNTAX describes the syntax of the regular expression (such as
!    RE_SYNTAX_POSIX_BASIC, RE_SYNTAX_POSIX_EXTENDED, RE_SYNTAX_EMACS, all
!    defined in <regex.h>).  It must be the same value as 're_syntax_options'
!    at the moment of the re_compile_pattern() call.
!    If ANCHORED is false, the regular expression will match substrings of lines.
!    If ANCHORED is true, it will match only complete lines.  */
! extern struct regex_quote_spec
!        regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored);
! 
! /* Creates a quotation task that produces a PCRE regular expression, that is,
!    a pattern that can be compiled with pcre_compile().
!    OPTIONS is the same value as the second argument passed to pcre_compile().
!    If ANCHORED is false, the regular expression will match substrings of lines.
!    If ANCHORED is true, it will match only complete lines.  */
! extern struct regex_quote_spec
!        regex_quote_spec_pcre (int options, bool anchored);
! 
  
  /* Returns the number of bytes needed for the quoted string.  */
! extern size_t
!        regex_quote_length (const char *string, const struct regex_quote_spec *spec);
  
  /* Copies the quoted string to p and returns the incremented p.
!    There must be room for regex_quote_length (string, spec) + 1 bytes at p.  */
! extern char *
!        regex_quote_copy (char *p,
!                          const char *string, const struct regex_quote_spec *spec);
  
  /* Returns the freshly allocated quoted string.  */
! extern char *
!        regex_quote (const char *string, const struct regex_quote_spec *spec);
! 
! 
! #endif /* _REGEX_QUOTE_H */
*** lib/regex-quote.c.orig      Sun Mar  6 14:37:53 2011
--- lib/regex-quote.c   Sun Mar  6 14:26:58 2011
***************
*** 31,86 ****
  /* Characters that are special in an ERE.  */
  static const char ere_special[] = "$^.*[]\\+?{}()|";
  
  size_t
! regex_quote_length (const char *string, int cflags)
  {
!   const char *special = (cflags != 0 ? ere_special : bre_special);
    size_t length;
-   mbui_iterator_t iter;
  
    length = 0;
!   for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
      {
!       /* We know that special contains only ASCII characters.  */
!       if (mb_len (mbui_cur (iter)) == 1
!           && strchr (special, * mbui_cur_ptr (iter)))
!         length += 1;
!       length += mb_len (mbui_cur (iter));
      }
    return length;
  }
  
- /* Copies the quoted string to p and returns the incremented p.
-    There must be room for regex_quote_length (string, cflags) + 1 bytes at p.
-  */
  char *
! regex_quote_copy (char *p, const char *string, int cflags)
  {
!   const char *special = (cflags != 0 ? ere_special : bre_special);
!   mbui_iterator_t iter;
  
!   for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
      {
!       /* We know that special contains only ASCII characters.  */
!       if (mb_len (mbui_cur (iter)) == 1
!           && strchr (special, * mbui_cur_ptr (iter)))
!         *p++ = '\\';
!       memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
!       p += mb_len (mbui_cur (iter));
      }
    return p;
  }
  
- /* Returns the freshly allocated quoted string.  */
  char *
! regex_quote (const char *string, int cflags)
  {
!   size_t length = regex_quote_length (string, cflags);
    char *result = XNMALLOC (length + 1, char);
    char *p;
  
    p = result;
!   p = regex_quote_copy (p, string, cflags);
    *p = '\0';
    return result;
  }
--- 31,216 ----
  /* Characters that are special in an ERE.  */
  static const char ere_special[] = "$^.*[]\\+?{}()|";
  
+ struct regex_quote_spec
+ regex_quote_spec_posix (int cflags, bool anchored)
+ {
+   struct regex_quote_spec result; /* filled in and returned by value */
+ 
+   strcpy (result.special, cflags != 0 ? ere_special : bre_special);
+   result.multibyte = true; /* quoting walks STRING with the mbui iterator */
+   result.anchored = anchored; /* quoting adds '^' and '$' when true */
+ 
+   return result;
+ }
+ 
+ /* Syntax bit values, defined in GNU <regex.h>.  We don't include it here,
+    otherwise this module would need to depend on gnulib module 'regex'.  */
+ #define RE_BK_PLUS_QM    0x00000002
+ #define RE_INTERVALS     0x00000200
+ #define RE_LIMITED_OPS   0x00000400
+ #define RE_NEWLINE_ALT   0x00000800
+ #define RE_NO_BK_BRACES  0x00001000
+ #define RE_NO_BK_PARENS  0x00002000
+ #define RE_NO_BK_VBAR    0x00008000
+ 
+ struct regex_quote_spec
+ regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored)
+ {
+   struct regex_quote_spec result;
+   char *p; /* write cursor into result.special */
+ 
+   p = result.special;
+   memcpy (p, bre_special, sizeof (bre_special) - 1); /* base set, without NUL */
+   p += sizeof (bre_special) - 1;
+   if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_BK_PLUS_QM) == 0)
+     {
+       *p++ = '+'; /* added only when both bits tested above are clear */
+       *p++ = '?';
+     }
+   if ((syntax & RE_INTERVALS) != 0 && (syntax & RE_NO_BK_BRACES) != 0)
+     {
+       *p++ = '{'; /* added only when both bits tested above are set */
+       *p++ = '}';
+     }
+   if ((syntax & RE_NO_BK_PARENS) != 0)
+     {
+       *p++ = '(';
+       *p++ = ')';
+     }
+   if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_NO_BK_VBAR) != 0)
+     *p++ = '|';
+   if ((syntax & RE_NEWLINE_ALT) != 0)
+     *p++ = '\n';
+   *p = '\0'; /* special is documented as a NUL-terminated string */
+ 
+   result.multibyte = true; /* quoting walks STRING with the mbui iterator */
+   result.anchored = anchored; /* quoting adds '^' and '$' when true */
+ 
+   return result;
+ }
+ 
+ /* Characters that are special in a PCRE.  */
+ static const char pcre_special[] = "$^.*[]\\+?{}()|";
+ 
+ /* Options bit values, defined in <pcre.h>.  We don't include it here, because
+    it is not a standard header.  */
+ #define PCRE_ANCHORED 0x00000010
+ #define PCRE_EXTENDED 0x00000008
+ 
+ struct regex_quote_spec
+ regex_quote_spec_pcre (int options, bool anchored)
+ {
+   struct regex_quote_spec result; /* filled in and returned by value */
+   char *p; /* write cursor into result.special */
+ 
+   p = result.special;
+   memcpy (p, pcre_special, sizeof (pcre_special) - 1); /* base set, without NUL */
+   p += sizeof (pcre_special) - 1;
+   if (options & PCRE_EXTENDED) /* whitespace and '#' then join the escaped set */
+     {
+       *p++ = ' ';
+       *p++ = '\t';
+       *p++ = '\n';
+       *p++ = '\v';
+       *p++ = '\f';
+       *p++ = '\r';
+       *p++ = '#';
+     }
+   *p = '\0';
+ 
+   /* PCRE regular expressions consist of UTF-8 characters if options contains
+      PCRE_UTF8 and of single bytes otherwise.  */
+   result.multibyte = false;
+   /* If options contains PCRE_ANCHORED, the anchoring is implicit.  */
+   result.anchored = (options & PCRE_ANCHORED ? 0 : anchored);
+ 
+   return result;
+ }
+ 
  size_t
! regex_quote_length (const char *string, const struct regex_quote_spec *spec)
  {
!   const char *special = spec->special;
    size_t length;
  
    length = 0;
!   if (spec->anchored)
!     length += 2; /* for '^' at the beginning and '$' at the end */
!   if (spec->multibyte)
!     {
!       mbui_iterator_t iter;
! 
!       for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
!         {
!           /* We know that special contains only ASCII characters.  */
!           if (mb_len (mbui_cur (iter)) == 1
!               && strchr (special, * mbui_cur_ptr (iter)))
!             length += 1; /* room for the escaping backslash */
!           length += mb_len (mbui_cur (iter));
!         }
!     }
!   else
      {
!       const char *iter; /* byte-wise scan up to the terminating NUL */
! 
!       for (iter = string; *iter != '\0'; iter++)
!         {
!           if (strchr (special, *iter))
!             length += 1; /* room for the escaping backslash */
!           length += 1;
!         }
      }
+ 
    return length;
  }
  
  char *
! regex_quote_copy (char *p, const char *string, const struct regex_quote_spec *spec)
  {
!   const char *special = spec->special;
  
!   if (spec->anchored)
!     *p++ = '^';
!   if (spec->multibyte)
      {
!       mbui_iterator_t iter;
! 
!       for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
!         {
!           /* We know that special contains only ASCII characters.  */
!           if (mb_len (mbui_cur (iter)) == 1
!               && strchr (special, * mbui_cur_ptr (iter)))
!             *p++ = '\\';
!           memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
!           p += mb_len (mbui_cur (iter));
!         }
      }
+   else
+     {
+       const char *iter; /* byte-wise scan up to the terminating NUL */
+ 
+       for (iter = string; *iter != '\0'; iter++)
+         {
+           if (strchr (special, *iter))
+             *p++ = '\\';
+           *p++ = *iter; /* iter is advanced by the for header only */
+         }
+     }
+   if (spec->anchored)
+     *p++ = '$';
+ 
    return p;
  }
  
  char *
! regex_quote (const char *string, const struct regex_quote_spec *spec)
  {
!   size_t length = regex_quote_length (string, spec); /* excludes the NUL */
    char *result = XNMALLOC (length + 1, char);
    char *p;
  
    p = result;
!   p = regex_quote_copy (p, string, spec); /* returns the end of the output */
    *p = '\0';
    return result;
  }
*** modules/regex-quote.orig    Sun Mar  6 14:37:53 2011
--- modules/regex-quote Sun Mar  6 14:26:31 2011
***************
*** 6,11 ****
--- 6,12 ----
  lib/regex-quote.c
  
  Depends-on:
+ stdbool
  xalloc
  mbuiter
  
*** tests/test-regex-quote.c.orig       Sun Mar  6 14:37:53 2011
--- tests/test-regex-quote.c    Sun Mar  6 14:26:31 2011
***************
*** 29,46 ****
  static void
  check (const char *literal, int cflags, const char *expected)
  {
    char *result;
    size_t length;
  
!   result = regex_quote (literal, cflags);
    ASSERT (strcmp (result, expected) == 0);
!   length = regex_quote_length (literal, cflags);
    ASSERT (length == strlen (result));
    free (result);
  
    result = (char *) xmalloc (1 + length + 1 + 1);
    result[0] = '^';
!   strcpy (regex_quote_copy (result + 1, literal, cflags), "$");
    {
      regex_t regex;
      regmatch_t match[1];
--- 29,65 ----
  static void
  check (const char *literal, int cflags, const char *expected)
  {
+   struct regex_quote_spec spec;
    char *result;
    size_t length;
  
!   spec = regex_quote_spec_posix (cflags, false);
!   result = regex_quote (literal, &spec);
    ASSERT (strcmp (result, expected) == 0);
!   length = regex_quote_length (literal, &spec);
    ASSERT (length == strlen (result));
    free (result);
  
    result = (char *) xmalloc (1 + length + 1 + 1);
    result[0] = '^';
!   strcpy (regex_quote_copy (result + 1, literal, &spec), "$");
!   {
!     regex_t regex;
!     regmatch_t match[1];
! 
!     ASSERT (regcomp (&regex, result, cflags) == 0);
! 
!     ASSERT (regexec (&regex, literal, 1, match, 0) == 0);
!     ASSERT (match[0].rm_so == 0);
!     ASSERT (match[0].rm_eo == strlen (literal));
!     regfree (&regex);
!   }
!   free (result);
! 
!   spec = regex_quote_spec_posix (cflags, true);
!   result = regex_quote (literal, &spec);
!   length = regex_quote_length (literal, &spec);
!   ASSERT (length == strlen (result));
    {
      regex_t regex;
      regmatch_t match[1];
-- 
In memoriam Marie Politzer <http://fr.wikipedia.org/wiki/Marie_Politzer>



reply via email to

[Prev in Thread] Current Thread [Next in Thread]