bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] New function fstrcmp_if_higher.


From: Ralf Wildenhues
Subject: [PATCH] New function fstrcmp_if_higher.
Date: Sun, 14 Sep 2008 10:14:12 +0200
User-agent: Mutt/1.5.18 (2008-05-17)

* lib/fstrcmp.c (fstrcmp_internal): Static rename from fstrcmp,
also take string lengths as arguments.
(fstrcmp): New, thin public wrapper around fstrcmp_internal.
(fstrcmp_upper_bound): New function, for fast fstrcmp estimate.
(fstrcmp_if_higher): New function, to compute fstrcmp value only
if it exceeds some bound.
* lib/fstrcmp.h: Declare it.
* tests/test-fstrcmp.c (check_fstrcmp): Test it.
---

This implements (1), the fast upper bound check with string lengths.
I assumed that it may be useful to keep the current fstrmp API
unchanged, for other potential users(?).

OK to apply?

Thanks,
Ralf

 lib/fstrcmp.c        |   86 +++++++++++++++++++++++++++++++++++--------------
 lib/fstrcmp.h        |    8 ++++-
 tests/test-fstrcmp.c |    7 +++-
 3 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c
index 70e6818..5ab6136 100644
--- a/lib/fstrcmp.c
+++ b/lib/fstrcmp.c
@@ -97,29 +97,13 @@ keys_init (void)
 gl_once_define(static, keys_init_once)
 
 
-/* NAME
-       fstrcmp - fuzzy string compare
-
-   SYNOPSIS
-       double fstrcmp(const char *, const char *);
-
-   DESCRIPTION
-       The fstrcmp function may be used to compare two string for
-       similarity.  It is very useful in reducing "cascade" or
-       "secondary" errors in compilers or other situations where
-       symbol tables occur.
-
-   RETURNS
-       double; 0 if the strings are entirly dissimilar, 1 if the
-       strings are identical, and a number in between if they are
-       similar.  */
-
-double
-fstrcmp (const char *string1, const char *string2)
+/* Compute and return the fstrcmp similarity value of STRING1 and STRING2
+   with lengths XVEC_LENGTH and YVEC_LENGTH.  */
+static double
+fstrcmp_internal (const char *string1, const char *string2,
+                 int xvec_length, int yvec_length)
 {
   struct context ctxt;
-  int xvec_length;
-  int yvec_length;
   int i;
 
   size_t fdiag_len;
@@ -128,9 +112,7 @@ fstrcmp (const char *string1, const char *string2)
 
   /* set the info for each string.  */
   ctxt.xvec = string1;
-  xvec_length = strlen (string1);
   ctxt.yvec = string2;
-  yvec_length = strlen (string2);
 
   /* short-circuit obvious comparisons */
   if (xvec_length == 0 && yvec_length == 0)
@@ -173,8 +155,7 @@ fstrcmp (const char *string1, const char *string2)
   /* Now do the main comparison algorithm */
   ctxt.xvec_edit_count = 0;
   ctxt.yvec_edit_count = 0;
-  compareseq (0, xvec_length, 0, yvec_length, 0,
-             &ctxt);
+  compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt);
 
   /* The result is
        ((number of chars in common) / (average length of the strings)).
@@ -184,3 +165,58 @@ fstrcmp (const char *string1, const char *string2)
                    - ctxt.yvec_edit_count - ctxt.xvec_edit_count)
          / (xvec_length + yvec_length));
 }
+
+
+/* NAME
+       fstrcmp - fuzzy string compare
+
+   SYNOPSIS
+       double fstrcmp(const char *, const char *);
+
+   DESCRIPTION
+       The fstrcmp function may be used to compare two string for
+       similarity.  It is very useful in reducing "cascade" or
+       "secondary" errors in compilers or other situations where
+       symbol tables occur.
+
+   RETURNS
+       double; 0 if the strings are entirly dissimilar, 1 if the
+       strings are identical, and a number in between if they are
+       similar.  */
+
+double
+fstrcmp (const char *string1, const char *string2)
+{
+  int xvec_length = strlen (string1);
+  int yvec_length = strlen (string2);
+  return fstrcmp_internal (string1, string2, xvec_length, yvec_length);
+}
+
+/* Upper bound for the fuzzy comparison of strings of length XVEC_LENGTH and
+   YVEC_LENGTH using fstrcmp.  The difference of the string lengths is a lower
+   bound for the Levenshtein distance, which translates into a lower bound for
+   the sum of the edit counts as computed by compareseq.  This translates into
+   an upper bound for the similarity value returned by fstrcmp.  */
+static double
+fstrcmp_upper_bound (int xvec_length, int yvec_length)
+{
+  return 2. * MIN(xvec_length, yvec_length) / (xvec_length + yvec_length);
+}
+
+/* Compute and return the fstrcmp similarity value of STRING1 and STRING2
+   if it is higher than BOUND.  Otherwise, return 0.  */
+double
+fstrcmp_if_higher (const char *string1, const char *string2, double bound)
+{
+  int xvec_length = strlen (string1);
+  int yvec_length = strlen (string2);
+  volatile double estimate = fstrcmp_upper_bound (xvec_length, yvec_length);
+  volatile double value;
+
+  if (estimate < bound)
+    return 0.;
+  value = fstrcmp_internal (string1, string2, xvec_length, yvec_length);
+  if (value < bound)
+    return 0.;
+  return value;
+}
diff --git a/lib/fstrcmp.h b/lib/fstrcmp.h
index 76fa393..1c1c538 100644
--- a/lib/fstrcmp.h
+++ b/lib/fstrcmp.h
@@ -1,5 +1,6 @@
 /* Fuzzy string comparison.
-   Copyright (C) 1995, 2000, 2002-2003, 2006 Free Software Foundation, Inc.
+   Copyright (C) 1995, 2000, 2002-2003, 2006, 2008 Free Software
+   Foundation, Inc.
 
    This file was written by Peter Miller <address@hidden>
 
@@ -27,6 +28,11 @@ extern "C" {
    and S1.  The higher the result, the more similar the strings are.  */
 extern double fstrcmp (const char *s1, const char *s2);
 
+/* Compute and return the fstrcmp similarity value of STRING1 and STRING2
+   if it is higher than BOUND.  Otherwise, return 0.  */
+extern double fstrcmp_if_higher (const char *string1, const char *string2,
+                                double bound);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tests/test-fstrcmp.c b/tests/test-fstrcmp.c
index 0e17ff5..8b95cb8 100644
--- a/tests/test-fstrcmp.c
+++ b/tests/test-fstrcmp.c
@@ -46,7 +46,12 @@ check_fstrcmp (const char *string1, const char *string2, 
double expected)
      compiler option dependent.  'volatile' is a portable alternative to gcc's
      -ffloat-store option.  */
   volatile double result = fstrcmp (string1, string2);
-  return (result == expected);
+  volatile double result2 = fstrcmp_if_higher (string1, string2, expected);
+  volatile double result3 = fstrcmp_if_higher (string1, string2, (1. + 
expected) / 2.);
+
+  return (result == expected
+          && result2 == expected
+          && (expected == 1.0 || result3 == 0.));
 }
 
 int
-- 
1.6.0.1.286.g599f2






reply via email to

[Prev in Thread] Current Thread [Next in Thread]