[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: c-strstr: add testcase
From: |
Bruno Haible |
Subject: |
Re: c-strstr: add testcase |
Date: |
Sun, 11 Feb 2007 15:13:54 +0100 |
User-agent: |
KMail/1.5.4 |
Now here's the fix that brings the worst-case complexity down from O(n*m)
to O(n). One could use either the Knuth-Morris-Pratt or the Boyer-Moore
algorithm. I'm using the former one, because Boyer-Moore works "backwards",
which is harder to generalize to the mbsstr case.
2007-02-11 Bruno Haible <address@hidden>
Ensure O(n) worst-case complexity of c_strstr.
* lib/c-strstr.c: Include stdbool.h, string.h.
(knuth_morris_pratt): New function.
(c_strstr): Add some bookkeeping. Invoke knuth_morris_pratt when the
bookkeeping indicates that it's worth it.
* modules/c-strstr (Depends-on): Add stdbool, strnlen.
*** lib/c-strstr.c 11 Feb 2007 13:58:43 -0000 1.4
--- lib/c-strstr.c 11 Feb 2007 14:01:35 -0000
***************
*** 21,27 ****
/* Specification. */
#include "c-strstr.h"
! #include <stddef.h>
/* Find the first occurrence of NEEDLE in HAYSTACK. */
char *
--- 21,117 ----
/* Specification. */
#include "c-strstr.h"
! #include <stdbool.h>
! #include <stdlib.h>
! #include <string.h>
!
! /* Knuth-Morris-Pratt algorithm.
! See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
! Return a boolean indicating success. */
! static bool
! knuth_morris_pratt (const char *haystack, const char *needle,
! const char **resultp)
! {
! size_t m = strlen (needle);
!
! /* Allocate the table. */
! size_t *table = (size_t *) malloc (m * sizeof (size_t));
! if (table == NULL)
! return false;
! /* Fill the table.
! For 0 < i < m:
! 0 < table[i] <= i is defined such that
! rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i]
! implies
! forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1],
! and table[i] is as large as possible with this property.
! table[0] remains uninitialized. */
! {
! size_t i, j;
!
! table[1] = 1;
! j = 0;
! for (i = 2; i < m; i++)
! {
! unsigned char b = (unsigned char) needle[i - 1];
!
! for (;;)
! {
! if (b == (unsigned char) needle[j])
! {
! table[i] = i - ++j;
! break;
! }
! if (j == 0)
! {
! table[i] = i;
! break;
! }
! j = j - table[j];
! }
! }
! }
!
! /* Search, using the table to accelerate the processing. */
! {
! size_t j;
! const char *rhaystack;
! const char *phaystack;
!
! *resultp = NULL;
! j = 0;
! rhaystack = haystack;
! phaystack = haystack;
! /* Invariant: phaystack = rhaystack + j. */
! while (*phaystack != '\0')
! if ((unsigned char) needle[j] == (unsigned char) *phaystack)
! {
! j++;
! phaystack++;
! if (j == m)
! {
! /* The entire needle has been found. */
! *resultp = rhaystack;
! break;
! }
! }
! else if (j > 0)
! {
! /* Found a match of needle[0..j-1], mismatch at needle[j]. */
! rhaystack += table[j];
! j -= table[j];
! }
! else
! {
! /* Found a mismatch at needle[0] already. */
! rhaystack++;
! phaystack++;
! }
! }
!
! free (table);
! return true;
! }
/* Find the first occurrence of NEEDLE in HAYSTACK. */
char *
***************
*** 34,39 ****
--- 124,149 ----
needle may be found in haystack. */
if (*needle != '\0')
{
+ /* Minimizing the worst-case complexity:
+ Let n = strlen(haystack), m = strlen(needle).
+ The naïve algorithm is O(n*m) worst-case.
+ The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
+ memory allocation.
+ To achieve linear complexity and yet amortize the cost of the memory
+ allocation, we activate the Knuth-Morris-Pratt algorithm only once
+ the naïve algorithm has already run for some time; more precisely,
+ when
+ - the outer loop count is >= 10,
+ - the average number of comparisons per outer loop is >= 5,
+ - the total number of comparisons is >= m.
+ But we try it only once. If the memory allocation attempt failed,
+ we don't retry it. */
+ bool try_kmp = true;
+ size_t outer_loop_count = 0;
+ size_t comparison_count = 0;
+ size_t last_ccount = 0; /* last comparison count */
+ const char *needle_last_ccount = needle; /* = needle +
last_ccount */
+
/* Speed up the following searches of needle by caching its first
character. */
unsigned char b = (unsigned char) *needle;
***************
*** 44,49 ****
--- 154,190 ----
if (*haystack == '\0')
/* No match. */
return NULL;
+
+ /* See whether it's advisable to use an asymptotically faster
+ algorithm. */
+ if (try_kmp
+ && outer_loop_count >= 10
+ && comparison_count >= 5 * outer_loop_count)
+ {
+ /* See if needle + comparison_count now reaches the end of
+ needle. */
+ if (needle_last_ccount != NULL)
+ {
+ needle_last_ccount +=
+ strnlen (needle_last_ccount, comparison_count -
last_ccount);
+ if (*needle_last_ccount == '\0')
+ needle_last_ccount = NULL;
+ last_ccount = comparison_count;
+ }
+ if (needle_last_ccount == NULL)
+ {
+ /* Try the Knuth-Morris-Pratt algorithm. */
+ const char *result;
+ bool success =
+ knuth_morris_pratt (haystack, needle - 1, &result);
+ if (success)
+ return (char *) result;
+ try_kmp = false;
+ }
+ }
+
+ outer_loop_count++;
+ comparison_count++;
if ((unsigned char) *haystack == b)
/* The first character matches. */
{
***************
*** 58,63 ****
--- 199,205 ----
if (*rhaystack == '\0')
/* No match. */
return NULL;
+ comparison_count++;
if ((unsigned char) *rhaystack != (unsigned char) *rneedle)
/* Nothing in this round. */
break;
*** modules/c-strstr 28 Aug 2006 13:04:33 -0000 1.1
--- modules/c-strstr 11 Feb 2007 14:01:35 -0000
***************
*** 6,11 ****
--- 6,13 ----
lib/c-strstr.c
Depends-on:
+ stdbool
+ strnlen
configure.ac: