bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

make strstr multibyte-safe


From: Bruno Haible
Subject: make strstr multibyte-safe
Date: Wed, 17 Aug 2005 17:35:22 +0200
User-agent: KMail/1.5

Hi,

After strcasecmp(), strstr() can also be made to work in multibyte locales.
I committed the appended patch. (Another possible implementation would
have been to call locale_charset() and compare its result to "BIG5", "GBK"
and a few others, to exploit the fact that a UTF-8 string can use the
bytewise search function.)

2005-08-17  Bruno Haible  <address@hidden>

        * modules/strstr (Files): Add m4/mbrtowc.m4.
        (Depends-on): Add mbuiter.
        * lib/strstr.h: Ignore HAVE_STRSTR, always declare the gnulib function.
        * lib/strstr.c: Completely rewritten, with multibyte locale support.
        * m4/strstr.m4 (gl_FUNC_STRSTR): Use the replacement function always.
        (gl_PREREQ_STRSTR): Use gl_FUNC_MBRTOWC.

Index: modules/strstr
===================================================================
RCS file: /cvsroot/gnulib/gnulib/modules/strstr,v
retrieving revision 1.5
diff -c -3 -r1.5 strstr
*** modules/strstr      22 Sep 2004 15:11:04 -0000      1.5
--- modules/strstr      17 Aug 2005 14:01:49 -0000
***************
*** 5,12 ****
--- 5,14 ----
  lib/strstr.h
  lib/strstr.c
  m4/strstr.m4
+ m4/mbrtowc.m4
  
  Depends-on:
+ mbuiter
  
  configure.ac:
  gl_FUNC_STRSTR
Index: lib/strstr.h
===================================================================
RCS file: /cvsroot/gnulib/gnulib/lib/strstr.h,v
retrieving revision 1.4
diff -c -3 -r1.4 strstr.h
*** lib/strstr.h        14 May 2005 06:03:58 -0000      1.4
--- lib/strstr.h        17 Aug 2005 14:01:49 -0000
***************
*** 1,5 ****
  /* Searching in a string.
!    Copyright (C) 2001-2003 Free Software Foundation, Inc.
  
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
--- 1,5 ----
  /* Searching in a string.
!    Copyright (C) 2001-2003, 2005 Free Software Foundation, Inc.
  
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
***************
*** 15,27 ****
     along with this program; if not, write to the Free Software Foundation,
     Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  
- #if HAVE_STRSTR
- 
- /* Get strstr() declaration.  */
- #include <string.h>
- 
- #else
- 
  #ifdef __cplusplus
  extern "C" {
  #endif
--- 15,20 ----
***************
*** 31,36 ****
  
  #ifdef __cplusplus
  }
- #endif
- 
  #endif
--- 24,27 ----
Index: lib/strstr.c
===================================================================
RCS file: /cvsroot/gnulib/gnulib/lib/strstr.c,v
retrieving revision 1.11
diff -c -3 -r1.11 strstr.c
*** lib/strstr.c        14 May 2005 06:03:58 -0000      1.11
--- lib/strstr.c        17 Aug 2005 14:01:49 -0000
***************
*** 1,119 ****
! /* Copyright (C) 1994, 1999, 2002-2003 Free Software Foundation, Inc.
! This file is part of the GNU C Library.
! 
! This program is free software; you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation; either version 2, or (at your option)
! any later version.
! 
! This program is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU General Public License for more details.
! 
! You should have received a copy of the GNU General Public License
! along with this program; if not, write to the Free Software
! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
! 
! /*
!  * My personal strstr() implementation that beats most other algorithms.
!  * Until someone tells me otherwise, I assume that this is the
!  * fastest implementation of strstr() in C.
!  * I deliberately chose not to comment it.  You should have at least
!  * as much fun trying to understand it, as I had to write it :-).
!  *
!  * Stephen R. van den Berg, address@hidden    */
  
  #if HAVE_CONFIG_H
  # include <config.h>
  #endif
  
! #include <string.h>
! 
! typedef unsigned chartype;
  
! #undef strstr
  
  char *
! strstr (const char *phaystack, const char *pneedle)
  {
!   register const unsigned char *haystack, *needle;
!   register chartype b, c;
  
!   haystack = (const unsigned char *) phaystack;
!   needle = (const unsigned char *) pneedle;
  
!   b = *needle;
!   if (b != '\0')
      {
!       haystack--;                             /* possible ANSI violation */
!       do
        {
!         c = *++haystack;
!         if (c == '\0')
!           goto ret0;
!       }
!       while (c != b);
! 
!       c = *++needle;
!       if (c == '\0')
!       goto foundneedle;
!       ++needle;
!       goto jin;
! 
!       for (;;)
!         {
!           register chartype a;
!         register const unsigned char *rhaystack, *rneedle;
  
!         do
            {
!             a = *++haystack;
!             if (a == '\0')
!               goto ret0;
!             if (a == b)
!               break;
!             a = *++haystack;
!             if (a == '\0')
!               goto ret0;
! shloop:;    }
!           while (a != b);
! 
! jin:    a = *++haystack;
!         if (a == '\0')
!           goto ret0;
! 
!         if (a != c)
!           goto shloop;
! 
!         rhaystack = haystack-- + 1;
!         rneedle = needle;
!         a = *rneedle;
! 
!         if (*rhaystack == a)
!           do
!             {
!               if (a == '\0')
!                 goto foundneedle;
!               ++rhaystack;
!               a = *++needle;
!               if (*rhaystack != a)
!                 break;
!               if (a == '\0')
!                 goto foundneedle;
!               ++rhaystack;
!               a = *++needle;
!             }
!           while (*rhaystack == a);
! 
!         needle = rneedle;                /* took the register-poor approach */
! 
!         if (a == '\0')
!           break;
!         }
      }
- foundneedle:
-   return (char*) haystack;
- ret0:
-   return 0;
  }
--- 1,126 ----
! /* Searching in a string.
!    Copyright (C) 2005 Free Software Foundation, Inc.
!    Written by Bruno Haible <address@hidden>, 2005.
! 
!    This program is free software; you can redistribute it and/or modify
!    it under the terms of the GNU General Public License as published by
!    the Free Software Foundation; either version 2, or (at your option)
!    any later version.
! 
!    This program is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU General Public License for more details.
! 
!    You should have received a copy of the GNU General Public License
!    along with this program; if not, write to the Free Software Foundation,
!    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  
  #if HAVE_CONFIG_H
  # include <config.h>
  #endif
  
! /* Specification.  */
! #include "strstr.h"
  
! #if HAVE_MBRTOWC
! # include "mbuiter.h"
! #endif
  
+ /* Find the first occurrence of NEEDLE in HAYSTACK.  */
  char *
! strstr (const char *haystack, const char *needle)
  {
!   /* Be careful not to look at the entire extent of haystack or needle
!      until needed.  This is useful because of these two cases:
!        - haystack may be very long, and a match of needle found early,
!        - needle may be very long, and not even a short initial segment of
!          needle may be found in haystack.  */
! #if HAVE_MBRTOWC
!   if (MB_CUR_MAX > 1)
!     {
!       mbui_iterator_t iter_needle;
  
!       mbui_init (iter_needle, needle);
!       if (mbui_avail (iter_needle))
!       {
!         mbui_iterator_t iter_haystack;
  
!         mbui_init (iter_haystack, haystack);
!         for (;; mbui_advance (iter_haystack))
!           {
!             if (!mbui_avail (iter_haystack))
!               /* No match.  */
!               return NULL;
! 
!             if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
!               /* The first character matches.  */
!               {
!                 mbui_iterator_t rhaystack;
!                 mbui_iterator_t rneedle;
! 
!                 memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
!                 mbui_advance (rhaystack);
! 
!                 mbui_init (rneedle, needle);
!                 if (!mbui_avail (rneedle))
!                   abort ();
!                 mbui_advance (rneedle);
! 
!                 for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
!                   {
!                     if (!mbui_avail (rneedle))
!                       /* Found a match.  */
!                       return (char *) haystack;
!                     if (!mbui_avail (rhaystack))
!                       /* No match.  */
!                       return NULL;
!                     if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
!                       /* Nothing in this round.  */
!                       break;
!                   }
!               }
!           }
!       }
!       else
!       return (char *) haystack;
!     }
!   else
! #endif
      {
!       if (*needle != '\0')
        {
!         /* Speed up the following searches of needle by caching its first
!            character.  */
!         char b = *needle++;
  
!         for (;; haystack++)
            {
!             if (*haystack == '\0')
!               /* No match.  */
!               return NULL;
!             if (*haystack == b)
!               /* The first character matches.  */
!               {
!                 const char *rhaystack = haystack + 1;
!                 const char *rneedle = needle;
! 
!                 for (;; rhaystack++, rneedle++)
!                   {
!                     if (*rneedle == '\0')
!                       /* Found a match.  */
!                       return (char *) haystack;
!                     if (*rhaystack == '\0')
!                       /* No match.  */
!                       return NULL;
!                     if (*rhaystack != *rneedle)
!                       /* Nothing in this round.  */
!                       break;
!                   }
!               }
!           }
!       }
!       else
!       return (char *) haystack;
      }
  }
Index: m4/strstr.m4
===================================================================
RCS file: /cvsroot/gnulib/gnulib/m4/strstr.m4,v
retrieving revision 1.3
diff -c -3 -r1.3 strstr.m4
*** m4/strstr.m4        18 Jan 2005 13:07:56 -0000      1.3
--- m4/strstr.m4        17 Aug 2005 14:01:49 -0000
***************
*** 1,16 ****
! # strstr.m4 serial 2
! dnl Copyright (C) 2002-2003 Free Software Foundation, Inc.
  dnl This file is free software; the Free Software Foundation
  dnl gives unlimited permission to copy and/or distribute it,
  dnl with or without modifications, as long as this notice is preserved.
  
  AC_DEFUN([gl_FUNC_STRSTR],
  [
!   AC_REPLACE_FUNCS(strstr)
!   if test $ac_cv_func_strstr = no; then
!     gl_PREREQ_STRSTR
!   fi
  ])
  
  # Prerequisites of lib/strstr.c.
! AC_DEFUN([gl_PREREQ_STRSTR], [:])
--- 1,19 ----
! # strstr.m4 serial 3
! dnl Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc.
  dnl This file is free software; the Free Software Foundation
  dnl gives unlimited permission to copy and/or distribute it,
  dnl with or without modifications, as long as this notice is preserved.
  
  AC_DEFUN([gl_FUNC_STRSTR],
  [
!   dnl No known system has a strstr() function that works correctly in
!   dnl multibyte locales. Therefore we use our version always.
!   AC_LIBOBJ(strstr)
!   AC_DEFINE(strstr, rpl_strstr, [Define to rpl_strstr always.])
!   gl_PREREQ_STRSTR
  ])
  
  # Prerequisites of lib/strstr.c.
! AC_DEFUN([gl_PREREQ_STRSTR], [
!   gl_FUNC_MBRTOWC
! ])





reply via email to

[Prev in Thread] Current Thread [Next in Thread]