bug-bash
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

PATCH to bash 2.05b: in-process regexp matching: [[ string =~ regexp ]]


From: Francis Litterio
Subject: PATCH to bash 2.05b: in-process regexp matching: [[ string =~ regexp ]]
Date: Sat, 03 Jan 2004 09:16:34 -0500
User-agent: Gnus/5.1002 (Gnus v5.10.2) Emacs/21.3 (windows-nt)

Bash developers (and other interested parties),

The patch below, against bash-2.05b, enables in-process extended regular
expression matching (i.e., egrep-style regexps) using this syntax:

   [[ string =~ regexp ]]

Substrings that are matched by parenthesized subexpressions within the
regexp are saved in the array RESUBEXP.

Example:

   bash$ [[ foobarbletchzzz =~ '(foo)(bar)(bletch)' ]] && echo ${RESUBEXP[*]}
   foobarbletch foo bar bletch

Element 0 of the array RESUBEXP is the part of the string that matched
the entire regular expression.  Elements 1, 2, etc. contain the parts
that match the 1st, 2nd, etc. parenthesized subexpression.

If the regexp is syntactically incorrect, the status is 2.  If the
regexp is syntactically correct but does not match, the status is 1.  If
the regexp is syntactically correct and matches, the status is 0.

If shell option nocaseglob is enabled, the regexp is matched
case-insensitively, otherwise it is case-sensitive.

The precedence of operator =~ is the same as == and !=.  Operator =~ is
non-associative (i.e., this is a syntax error: [[ x =~ y =~ z ]]).

More examples:

1. Variables can hold the string and regexp:

   bash$ string="foobarbletch"
   bash$ regexp="(fo+)(b.*)(b.*)"
   bash$ [[ $string =~ $regexp ]] && echo ${RESUBEXP[*]}
   foobarbletch foo bar bletch

2. The =~ can be mixed with other [[ ... ]] operators:

   bash$ [[ foobar =~ foobar && $SHLVL -lt 10 ]] && echo yes
   yes

3. Array RESUBEXP can be referenced within the same [[ ... ]] command
   following any match:

   bash$ [[ foobar22 =~ '(foo)bar(.*)' && ${RESUBEXP[2]} -eq 22 ]] && echo yes
   yes

4. Array RESUBEXP can hold lots of matched subexpressions:

   bash$ [[ abcdefghijklmnopqrstuvwxyz =~ \
      '(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)' ]] && \
      echo ${RESUBEXP[*]}
   abcdefghijklmnopqrs a b c d e f g h i j k l m n o p q r s

5. Empty and syntactically incorrect regexps are invalid but don't cause
   a shell syntax error:

   bash$ [[ foobar =~ '' ]]; echo $?
   2
   bash$ [[ ! foobar =~ '' ]] && echo well well
   well well
   bash$ [[ ! foobar =~ '(foo' ]] && echo well well
   well well

6. Use shell option nocaseglob to control case-sensitivity:

   bash$ shopt -u nocaseglob
   bash$ [[ foo =~ F.O ]] && echo yes
   bash$ shopt -s nocaseglob
   bash$ [[ foo =~ F.O ]] && echo yes
   yes

I hope the Bash community finds this functionality useful.
--
Francis Litterio
address@hidden
http://world.std.com/~franl/
GPG and PGP public keys available on keyservers.


diff -ruw bash-2.05b/execute_cmd.c bash-2.05b-franl/execute_cmd.c
--- bash-2.05b/execute_cmd.c    2002-03-18 13:24:22.000000000 -0500
+++ bash-2.05b-franl/execute_cmd.c      2003-07-17 09:51:34.000000000 -0400
@@ -24,6 +24,7 @@
 #endif /* _AIX && RISC6000 && !__GNUC__ */
 
 #include <stdio.h>
+#include <regex.h>
 #include "chartypes.h"
 #include "bashtypes.h"
 #ifndef _MINIX
@@ -94,6 +95,8 @@
 #  include "bashhist.h"
 #endif
 
+#include "variables.h"
+
 extern int posixly_correct;
 extern int breaking, continuing, loop_level;
 extern int expand_aliases;
@@ -118,6 +121,7 @@
 static void bind_lastarg __P((char *));
 static int shell_control_structure __P((enum command_type));
 static void cleanup_redirects __P((REDIRECT *));
+static int execute_cond_regexp __P((COND_COM *));
 
 #if defined (JOB_CONTROL)
 static int restore_signal_mask __P((sigset_t *));
@@ -2295,6 +2299,12 @@
     }
   else if (cond->type == COND_BINARY)
     {
+      if ((cond->op->word[0] == '=') && (cond->op->word[1] == '~'))
+        {
+         result = execute_cond_regexp(cond);
+       }
+      else
+       {
       patmatch = ((cond->op->word[1] == '=') && (cond->op->word[2] == '\0') &&
                  (cond->op->word[0] == '!' || cond->op->word[0] == '=') ||
                  (cond->op->word[0] == '=' && cond->op->word[1] == '\0'));
@@ -2317,6 +2327,7 @@
       if (arg2 != nullstr)
        free (arg2);
     }
+    }
   else
     {
       command_error ("execute_cond_node", CMDERR_BADTYPE, cond->type, 0);
@@ -2330,6 +2341,96 @@
   return result;
 }
 
+/* Execute the `=~' regular expression matching operator.  The LHS of the
+   operator is the string to match and the RHS is a POSIX extended regular
+   expression, matched case-insensitively when glob_ignore_case is set.
+   Returns EXECUTION_SUCCESS if the string matches, EXECUTION_FAILURE if it
+   does not (or if memory for the match data cannot be allocated), and 2 if
+   the regexp is empty or does not compile.
+   Whenever the regexp compiles, the array RESUBEXP is recreated and marked
+   readonly.  On a match, element 0 holds the text matched by the entire
+   regexp and element N the text matched by the Nth parenthesized
+   subexpression (empty if that subexpression did not participate in the
+   match).  When there is no match RESUBEXP is left empty; when the regexp
+   is invalid RESUBEXP is not touched. */
+static int
+execute_cond_regexp (COND_COM *cond)
+{
+  extern int glob_ignore_case;
+  regex_t regex;
+  regmatch_t *matches;
+  SHELL_VAR *resubexp;
+  ARRAY *array;
+  char *arg1, *arg2, *subexp;
+  size_t len;
+  int flags, i, result;
+
+  arg1 = cond_expand_word (cond->left->op, 0);
+  if (arg1 == 0)
+    arg1 = nullstr;
+
+  arg2 = cond_expand_word (cond->right->op, 0);
+  if (arg2 == 0)
+    arg2 = nullstr;
+
+  flags = REG_EXTENDED;        /* egrep-style regexps */
+  if (glob_ignore_case)
+    flags |= REG_ICASE;
+
+  /* regcomp implementations differ on whether an empty pattern is an
+     error; reject it here so the status is always 2. */
+  if (*arg2 == '\0' || regcomp (&regex, arg2, flags) != 0)
+    result = 2;
+  else
+    {
+      /* One regmatch_t for the whole match plus one per subexpression, and
+         a scratch buffer big enough for any substring of ARG1. */
+      matches = (regmatch_t *)malloc (sizeof (regmatch_t)
+                                      * (regex.re_nsub + 1));
+      subexp = (char *)malloc (strlen (arg1) + 1);
+
+      if (matches != 0 && subexp != 0
+          && regexec (&regex, arg1, regex.re_nsub + 1, matches, 0) == 0)
+        result = EXECUTION_SUCCESS;
+      else
+        result = EXECUTION_FAILURE;
+
+      unbind_variable ("RESUBEXP");
+      resubexp = make_new_array_variable ("RESUBEXP");
+      array = array_cell (resubexp);
+
+      if (result == EXECUTION_SUCCESS)
+        for (i = 0; i <= (int)regex.re_nsub; i++)
+          {
+            /* rm_so is -1 for a subexpression that took no part in the
+               match; store an empty string for it. */
+            len = 0;
+            if (matches[i].rm_so >= 0)
+              {
+                len = matches[i].rm_eo - matches[i].rm_so;
+                memcpy (subexp, arg1 + matches[i].rm_so, len);
+              }
+            subexp[len] = '\0';
+            array_insert (array, i, subexp);
+          }
+
+      VSETATTR (resubexp, att_readonly);
+
+      /* Release what this branch acquired, match or not.  REGEX is freed
+         only here because it is valid only when regcomp succeeded. */
+      free (subexp);
+      free (matches);
+      regfree (&regex);
+    }
+
+  if (arg1 != nullstr)
+    free (arg1);
+  if (arg2 != nullstr)
+    free (arg2);
+
+  return result;
+}
+
 static int
 execute_cond_command (cond_command)
      COND_COM *cond_command;
diff -ruw bash-2.05b/parse.y bash-2.05b-franl/parse.y
--- bash-2.05b/parse.y  2002-05-21 11:57:30.000000000 -0400
+++ bash-2.05b-franl/parse.y    2003-07-17 01:03:46.000000000 -0400
@@ -296,7 +296,7 @@
 %token <number> NUMBER
 %token <word_list> ARITH_CMD ARITH_FOR_EXPRS
 %token <command> COND_CMD
-%token AND_AND OR_OR GREATER_GREATER LESS_LESS LESS_AND LESS_LESS_LESS
+%token AND_AND OR_OR GREATER_GREATER LESS_LESS LESS_AND LESS_LESS_LESS REGEXP_BIND
 %token GREATER_AND SEMI_SEMI LESS_LESS_MINUS AND_GREATER LESS_GREATER
 %token GREATER_BAR
 
@@ -1675,6 +1675,7 @@
   { "&>", AND_GREATER },
   { "<>", LESS_GREATER },
   { ">|", GREATER_BAR },
+  { "=~", REGEXP_BIND },
   { "EOF", yacc_EOF },
   /* Tokens whose value is the character itself */
   { ">", '>' },
@@ -2442,6 +2443,16 @@
       return (character);
     }
 
+  if ((parser_state & PST_CONDEXPR) != 0)
+  {
+      peek_char = shell_getc (1);
+      if (MBTEST(character == '=' && peek_char == '~'))
+      {
+         return (REGEXP_BIND);
+      }
+      shell_ungetc (peek_char);
+  }
+
   /* Shell meta-characters. */
   if MBTEST(shellmeta (character) && ((parser_state & PST_DBLPAREN) == 0))
     {
@@ -3005,6 +3016,8 @@
       tok = read_token (READ);
       if (tok == WORD && test_binop (yylval.word->word))
        op = yylval.word;
+      else if (tok == REGEXP_BIND)
+         op = make_word ("=~");
       else if (tok == '<' || tok == '>')
        op = make_word_from_token (tok);  /* ( */
       /* There should be a check before blindly accepting the `)' that we have





reply via email to

[Prev in Thread] Current Thread [Next in Thread]