[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] New function fstrcmp_if_higher.
From: |
Ralf Wildenhues |
Subject: |
[PATCH] New function fstrcmp_if_higher. |
Date: |
Sun, 14 Sep 2008 10:14:12 +0200 |
User-agent: |
Mutt/1.5.18 (2008-05-17) |
* lib/fstrcmp.c (fstrcmp_internal): Static rename from fstrcmp,
also take string lengths as arguments.
(fstrcmp): New, thin public wrapper around fstrcmp_internal.
(fstrcmp_upper_bound): New function, for fast fstrcmp estimate.
(fstrcmp_if_higher): New function, to compute fstrcmp value only
if it exceeds some bound.
* lib/fstrcmp.h: Declare it.
* tests/test-fstrcmp.c (check_fstrcmp): Test it.
---
This implements (1), the fast upper bound check with string lengths.
I assumed that it may be useful to keep the current fstrmp API
unchanged, for other potential users(?).
OK to apply?
Thanks,
Ralf
lib/fstrcmp.c | 86 +++++++++++++++++++++++++++++++++++--------------
lib/fstrcmp.h | 8 ++++-
tests/test-fstrcmp.c | 7 +++-
3 files changed, 74 insertions(+), 27 deletions(-)
diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c
index 70e6818..5ab6136 100644
--- a/lib/fstrcmp.c
+++ b/lib/fstrcmp.c
@@ -97,29 +97,13 @@ keys_init (void)
gl_once_define(static, keys_init_once)
-/* NAME
- fstrcmp - fuzzy string compare
-
- SYNOPSIS
- double fstrcmp(const char *, const char *);
-
- DESCRIPTION
- The fstrcmp function may be used to compare two string for
- similarity. It is very useful in reducing "cascade" or
- "secondary" errors in compilers or other situations where
- symbol tables occur.
-
- RETURNS
- double; 0 if the strings are entirly dissimilar, 1 if the
- strings are identical, and a number in between if they are
- similar. */
-
-double
-fstrcmp (const char *string1, const char *string2)
+/* Compute and return the fstrcmp similarity value of STRING1 and STRING2
+ with lengths XVEC_LENGTH and YVEC_LENGTH. */
+static double
+fstrcmp_internal (const char *string1, const char *string2,
+ int xvec_length, int yvec_length)
{
struct context ctxt;
- int xvec_length;
- int yvec_length;
int i;
size_t fdiag_len;
@@ -128,9 +112,7 @@ fstrcmp (const char *string1, const char *string2)
/* set the info for each string. */
ctxt.xvec = string1;
- xvec_length = strlen (string1);
ctxt.yvec = string2;
- yvec_length = strlen (string2);
/* short-circuit obvious comparisons */
if (xvec_length == 0 && yvec_length == 0)
@@ -173,8 +155,7 @@ fstrcmp (const char *string1, const char *string2)
/* Now do the main comparison algorithm */
ctxt.xvec_edit_count = 0;
ctxt.yvec_edit_count = 0;
- compareseq (0, xvec_length, 0, yvec_length, 0,
- &ctxt);
+ compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt);
/* The result is
((number of chars in common) / (average length of the strings)).
@@ -184,3 +165,58 @@ fstrcmp (const char *string1, const char *string2)
- ctxt.yvec_edit_count - ctxt.xvec_edit_count)
/ (xvec_length + yvec_length));
}
+
+
+/* NAME
+ fstrcmp - fuzzy string compare
+
+ SYNOPSIS
+ double fstrcmp(const char *, const char *);
+
+ DESCRIPTION
+ The fstrcmp function may be used to compare two string for
+ similarity. It is very useful in reducing "cascade" or
+ "secondary" errors in compilers or other situations where
+ symbol tables occur.
+
+ RETURNS
+ double; 0 if the strings are entirly dissimilar, 1 if the
+ strings are identical, and a number in between if they are
+ similar. */
+
+double
+fstrcmp (const char *string1, const char *string2)
+{
+ int xvec_length = strlen (string1);
+ int yvec_length = strlen (string2);
+ return fstrcmp_internal (string1, string2, xvec_length, yvec_length);
+}
+
+/* Upper bound for the fuzzy comparison of strings of length XVEC_LENGTH and
+ YVEC_LENGTH using fstrcmp. The difference of the string lengths is a lower
+ bound for the Levenshtein distance, which translates into a lower bound for
+ the sum of the edit counts as computed by compareseq. This translates into
+ an upper bound for the similarity value returned by fstrcmp. */
+static double
+fstrcmp_upper_bound (int xvec_length, int yvec_length)
+{
+ return 2. * MIN(xvec_length, yvec_length) / (xvec_length + yvec_length);
+}
+
+/* Compute and return the fstrcmp similarity value of STRING1 and STRING2
+ if it is higher than BOUND. Otherwise, return 0. */
+double
+fstrcmp_if_higher (const char *string1, const char *string2, double bound)
+{
+ int xvec_length = strlen (string1);
+ int yvec_length = strlen (string2);
+ volatile double estimate = fstrcmp_upper_bound (xvec_length, yvec_length);
+ volatile double value;
+
+ if (estimate < bound)
+ return 0.;
+ value = fstrcmp_internal (string1, string2, xvec_length, yvec_length);
+ if (value < bound)
+ return 0.;
+ return value;
+}
diff --git a/lib/fstrcmp.h b/lib/fstrcmp.h
index 76fa393..1c1c538 100644
--- a/lib/fstrcmp.h
+++ b/lib/fstrcmp.h
@@ -1,5 +1,6 @@
/* Fuzzy string comparison.
- Copyright (C) 1995, 2000, 2002-2003, 2006 Free Software Foundation, Inc.
+ Copyright (C) 1995, 2000, 2002-2003, 2006, 2008 Free Software
+ Foundation, Inc.
This file was written by Peter Miller <address@hidden>
@@ -27,6 +28,11 @@ extern "C" {
and S1. The higher the result, the more similar the strings are. */
extern double fstrcmp (const char *s1, const char *s2);
+/* Compute and return the fstrcmp similarity value of STRING1 and STRING2
+ if it is higher than BOUND. Otherwise, return 0. */
+extern double fstrcmp_if_higher (const char *string1, const char *string2,
+ double bound);
+
#ifdef __cplusplus
}
#endif
diff --git a/tests/test-fstrcmp.c b/tests/test-fstrcmp.c
index 0e17ff5..8b95cb8 100644
--- a/tests/test-fstrcmp.c
+++ b/tests/test-fstrcmp.c
@@ -46,7 +46,12 @@ check_fstrcmp (const char *string1, const char *string2,
double expected)
compiler option dependent. 'volatile' is a portable alternative to gcc's
-ffloat-store option. */
volatile double result = fstrcmp (string1, string2);
- return (result == expected);
+ volatile double result2 = fstrcmp_if_higher (string1, string2, expected);
+ volatile double result3 = fstrcmp_if_higher (string1, string2, (1. +
expected) / 2.);
+
+ return (result == expected
+ && result2 == expected
+ && (expected == 1.0 || result3 == 0.));
}
int
--
1.6.0.1.286.g599f2
- msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/14
- [PATCH] New function fstrcmp_if_higher.,
Ralf Wildenhues <=
- [PATCH] Implement premature termination of compareseq., Ralf Wildenhues, 2008/09/14
- Message not available
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/15
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Bruno Haible, 2008/09/15
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/19
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Bruno Haible, 2008/09/20
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/20
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Paolo Bonzini, 2008/09/21