bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Implement premature termination of compareseq.


From: Ralf Wildenhues
Subject: [PATCH] Implement premature termination of compareseq.
Date: Sun, 14 Sep 2008 10:15:26 +0200
User-agent: Mutt/1.5.18 (2008-05-17)

* lib/diffseq.h (CHECK_CONDITION): New required define; may be
empty.
(compareseq): Use it after each note, and recursive compareseq
invocation.
* lib/fstrcmp.c (EXTRA_CONTEXT_FIELDS): New field edit_count_limit.
(CHECK_CONDITION): New define; check if the bound has been reached.
(fstrcmp_internal): New argument 'bound', used to compute
edit_count_limit.  Return zero  if the limit has been reached.
(fstrcmp): Adjust caller, using zero bound.
(fstrcmp_if_higher): Likewise, using given bound.
---

This patch implements (2), premature termination of the compareseq
code.  This patch requires diffutils to add an empty CHECK_CONDITION
define, but otherwise has no impact on its use of diffseq.h.  The
patch relies on the previous posted patch.

I should note that the computation of edit_count_limit adds
1 (not 0.5) to the computation to avoid issues with strings
that have very similar fstrcmp values.

This change may slow down fstrcmp a bit (I haven't measured),
but it's clearly not an algorithmic decay, and the additional
tests are outside of the hot inner loop (diag).

OK to apply to gnulib?

Thanks,
Ralf

 lib/diffseq.h |    9 ++++++++-
 lib/fstrcmp.c |   32 ++++++++++++++++++++------------
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/lib/diffseq.h b/lib/diffseq.h
index 7a62196..93091fd 100644
--- a/lib/diffseq.h
+++ b/lib/diffseq.h
@@ -49,6 +49,7 @@
      EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
      NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
      NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
+     CHECK_CONDITION         Check for premature termination.
      USE_HEURISTIC           (Optional) Define if you want to support the
                              heuristic for large vectors.
    Before including this file, you also need to include:
@@ -407,7 +408,9 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, 
bool find_minimal,
    If FIND_MINIMAL, find a minimal difference no matter how
    expensive it is.
 
-   The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.  */
+   The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
+   The algorithm may be terminated prematurely using CHECK_CONDITION.
+ */
 
 static void
 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
@@ -435,12 +438,14 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET 
ylim,
     while (yoff < ylim)
       {
        NOTE_INSERT (ctxt, yoff);
+       CHECK_CONDITION (ctxt);
        yoff++;
       }
   else if (yoff == ylim)
     while (xoff < xlim)
       {
        NOTE_DELETE (ctxt, xoff);
+       CHECK_CONDITION (ctxt);
        xoff++;
       }
   else
@@ -452,6 +457,7 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET 
ylim,
 
       /* Use the partitions to split this problem into subproblems.  */
       compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
+      CHECK_CONDITION (ctxt);
       compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
     }
 }
@@ -462,5 +468,6 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET 
ylim,
 #undef EXTRA_CONTEXT_FIELDS
 #undef NOTE_DELETE
 #undef NOTE_INSERT
+#undef CHECK_CONDITION
 #undef USE_HEURISTIC
 #undef OFFSET_MAX
diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c
index 5ab6136..efb7331 100644
--- a/lib/fstrcmp.c
+++ b/lib/fstrcmp.c
@@ -67,9 +67,13 @@
 #define EXTRA_CONTEXT_FIELDS \
   /* The number of elements inserted or deleted. */ \
   int xvec_edit_count; \
-  int yvec_edit_count;
+  int yvec_edit_count; \
+  int edit_count_limit;
 #define NOTE_DELETE(ctxt, xoff) ctxt->xvec_edit_count++
 #define NOTE_INSERT(ctxt, yoff) ctxt->yvec_edit_count++
+#define CHECK_CONDITION(ctxt) \
+  if (ctxt->xvec_edit_count + ctxt->yvec_edit_count >= ctxt->edit_count_limit) 
\
+    return
 /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of
    fstrcmp().  */
 #include "diffseq.h"
@@ -98,10 +102,10 @@ gl_once_define(static, keys_init_once)
 
 
 /* Compute and return the fstrcmp similarity value of STRING1 and STRING2
-   with lengths XVEC_LENGTH and YVEC_LENGTH.  */
+   if it is higher than BOUND.  Otherwise, return 0.  */
 static double
 fstrcmp_internal (const char *string1, const char *string2,
-                 int xvec_length, int yvec_length)
+                 int xvec_length, int yvec_length, double bound)
 {
   struct context ctxt;
   int i;
@@ -155,15 +159,19 @@ fstrcmp_internal (const char *string1, const char 
*string2,
   /* Now do the main comparison algorithm */
   ctxt.xvec_edit_count = 0;
   ctxt.yvec_edit_count = 0;
+  ctxt.edit_count_limit = (1. - bound) * (xvec_length + yvec_length) + 1;
   compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt);
 
-  /* The result is
-       ((number of chars in common) / (average length of the strings)).
-     This is admittedly biased towards finding that the strings are
-     similar, however it does produce meaningful results.  */
-  return ((double) (xvec_length + yvec_length
-                   - ctxt.yvec_edit_count - ctxt.xvec_edit_count)
-         / (xvec_length + yvec_length));
+  if (ctxt.xvec_edit_count + ctxt.yvec_edit_count >= ctxt.edit_count_limit)
+    return 0.;
+  else
+    /* The result is
+         ((number of chars in common) / (average length of the strings)).
+       This is admittedly biased towards finding that the strings are
+       similar, however it does produce meaningful results.  */
+    return ((double) (xvec_length + yvec_length
+                     - ctxt.yvec_edit_count - ctxt.xvec_edit_count)
+           / (xvec_length + yvec_length));
 }
 
 
@@ -189,7 +197,7 @@ fstrcmp (const char *string1, const char *string2)
 {
   int xvec_length = strlen (string1);
   int yvec_length = strlen (string2);
-  return fstrcmp_internal (string1, string2, xvec_length, yvec_length);
+  return fstrcmp_internal (string1, string2, xvec_length, yvec_length, 0.);
 }
 
 /* Upper bound for the fuzzy comparison of strings of length XVEC_LENGTH and
@@ -215,7 +223,7 @@ fstrcmp_if_higher (const char *string1, const char 
*string2, double bound)
 
   if (estimate < bound)
     return 0.;
-  value = fstrcmp_internal (string1, string2, xvec_length, yvec_length);
+  value = fstrcmp_internal (string1, string2, xvec_length, yvec_length, bound);
   if (value < bound)
     return 0.;
   return value;
-- 
1.6.0.1.286.g599f2






reply via email to

[Prev in Thread] Current Thread [Next in Thread]