bug-datamash
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Add geomean stats operation for computing geometric mean


From: Shawn
Subject: [PATCH] Add geomean stats operation for computing geometric mean
Date: Wed, 04 Mar 2020 09:59:03 -0000

See subject. Pretty straightforward and hopefully useful addition to
the statistics functions.

---
 Makefile.am             |   1 +
 bootstrap.conf          |   1 +
 doc/datamash.texi       |   2 +
 m4/.gitignore           |   1 +
 m4/logl.m4              | 200 ++++++++++++++++++++++++++++++++++++++++
 src/datamash.c          |   2 +-
 src/field-ops.c         |   8 ++
 src/field-ops.h         |   3 +-
 src/op-defs.c           |   1 +
 src/op-defs.h           |   1 +
 src/utils.c             |  12 +++
 src/utils.h             |   4 +
 tests/datamash-stats.pl |  16 ++++
 13 files changed, 250 insertions(+), 2 deletions(-)
 create mode 100644 m4/logl.m4

diff --git a/Makefile.am b/Makefile.am
index e80d9cc..fa5f5e3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -58,6 +58,7 @@ datamash_LDADD = \
        $(LIBICONV) \
        $(LIBINTL) \
        $(LIB_CRYPTO) \
+       $(LOGL_LIBM) \
        $(ROUND_LIBM) \
        $(ROUNDL_LIBM) \
        $(SQRT_LIBM) \
diff --git a/bootstrap.conf b/bootstrap.conf
index e0d892c..e416662 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -62,6 +62,7 @@ gnulib_modules="
     linebuffer
     locale
     localeconv
+    logl
     maintainer-makefile
     mbsrtowcs
     minmax
diff --git a/doc/datamash.texi b/doc/datamash.texi
index 4d87e18..9574b2d 100644
--- a/doc/datamash.texi
+++ b/doc/datamash.texi
@@ -448,6 +448,8 @@ number of unique/distinct values
 @table @option
 @item mean
 mean of the values
+@item geomean
+geometric mean of the values (should be greater than 0)
 @item trimmean
 trimmed mean of the values
 @item median
diff --git a/m4/.gitignore b/m4/.gitignore
index 02e4cef..eef4243 100644
--- a/m4/.gitignore
+++ b/m4/.gitignore
@@ -186,3 +186,4 @@
 /iswxdigit.m4
 /setlocale_null.m4
 /zzgnulib.m4
+/log.m4
diff --git a/m4/logl.m4 b/m4/logl.m4
new file mode 100644
index 0000000..7401da7
--- /dev/null
+++ b/m4/logl.m4
@@ -0,0 +1,200 @@
+# logl.m4 serial 13
+dnl Copyright (C) 2010-2020 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+AC_DEFUN([gl_FUNC_LOGL],
+[
+  AC_REQUIRE([gl_MATH_H_DEFAULTS])
+  AC_REQUIRE([gl_LONG_DOUBLE_VS_DOUBLE])
+
+  dnl Persuade glibc <math.h> to declare logl().
+  AC_REQUIRE([gl_USE_SYSTEM_EXTENSIONS])
+
+  LOGL_LIBM=
+  AC_CACHE_CHECK([whether logl() can be used without linking with libm],
+    [gl_cv_func_logl_no_libm],
+    [
+      AC_LINK_IFELSE(
+        [AC_LANG_PROGRAM(
+           [[#ifndef __NO_MATH_INLINES
+             # define __NO_MATH_INLINES 1 /* for glibc */
+             #endif
+             #include <math.h>
+             long double (*funcptr) (long double) = logl;
+             long double x;]],
+           [[return funcptr (x) > 1
+                    || logl (x) > 1;]])],
+        [gl_cv_func_logl_no_libm=yes],
+        [gl_cv_func_logl_no_libm=no])
+    ])
+  if test $gl_cv_func_logl_no_libm = no; then
+    AC_CACHE_CHECK([whether logl() can be used with libm],
+      [gl_cv_func_logl_in_libm],
+      [
+        save_LIBS="$LIBS"
+        LIBS="$LIBS -lm"
+        AC_LINK_IFELSE(
+          [AC_LANG_PROGRAM(
+             [[#ifndef __NO_MATH_INLINES
+               # define __NO_MATH_INLINES 1 /* for glibc */
+               #endif
+               #include <math.h>
+               long double (*funcptr) (long double) = logl;
+               long double x;]],
+             [[return funcptr (x) > 1
+                      || logl (x) > 1;]])],
+          [gl_cv_func_logl_in_libm=yes],
+          [gl_cv_func_logl_in_libm=no])
+        LIBS="$save_LIBS"
+      ])
+    if test $gl_cv_func_logl_in_libm = yes; then
+      LOGL_LIBM=-lm
+    fi
+  fi
+  if test $gl_cv_func_logl_no_libm = yes \
+     || test $gl_cv_func_logl_in_libm = yes; then
+    dnl Also check whether it's declared.
+    dnl Mac OS X 10.3 has logl() in libc but doesn't declare it in <math.h>.
+    AC_CHECK_DECL([logl], , [HAVE_DECL_LOGL=0], [[#include <math.h>]])
+    save_LIBS="$LIBS"
+    LIBS="$LIBS $LOGL_LIBM"
+    gl_FUNC_LOGL_WORKS
+    LIBS="$save_LIBS"
+    case "$gl_cv_func_logl_works" in
+      *yes) ;;
+      *) REPLACE_LOGL=1 ;;
+    esac
+  else
+    HAVE_LOGL=0
+    HAVE_DECL_LOGL=0
+  fi
+  if test $HAVE_LOGL = 0 || test $REPLACE_LOGL = 1; then
+    dnl Find libraries needed to link lib/logl.c.
+    if test $HAVE_SAME_LONG_DOUBLE_AS_DOUBLE = 1; then
+      AC_REQUIRE([gl_FUNC_LOG])
+      LOGL_LIBM="$LOG_LIBM"
+    else
+      if test $HAVE_LOGL = 0; then
+        AC_REQUIRE([gl_FUNC_FREXPL])
+        AC_REQUIRE([gl_FUNC_ISNANL])
+        AC_REQUIRE([gl_FUNC_FLOORL])
+        dnl Append $FREXPL_LIBM to LOGL_LIBM, avoiding gratuitous duplicates.
+        case " $LOGL_LIBM " in
+          *" $FREXPL_LIBM "*) ;;
+          *) LOGL_LIBM="$LOGL_LIBM $FREXPL_LIBM" ;;
+        esac
+        dnl Append $ISNANL_LIBM to LOGL_LIBM, avoiding gratuitous duplicates.
+        case " $LOGL_LIBM " in
+          *" $ISNANL_LIBM "*) ;;
+          *) LOGL_LIBM="$LOGL_LIBM $ISNANL_LIBM" ;;
+        esac
+        dnl Append $FLOORL_LIBM to LOGL_LIBM, avoiding gratuitous duplicates.
+        case " $LOGL_LIBM " in
+          *" $FLOORL_LIBM "*) ;;
+          *) LOGL_LIBM="$LOGL_LIBM $FLOORL_LIBM" ;;
+        esac
+      fi
+    fi
+  fi
+  AC_SUBST([LOGL_LIBM])
+])
+
+dnl Test whether logl() works.
+dnl On OSF/1 5.1, logl(-0.0L) is NaN.
+dnl On NetBSD 8.0, the result is accurate to only 16 digits.
+AC_DEFUN([gl_FUNC_LOGL_WORKS],
+[
+  AC_REQUIRE([AC_PROG_CC])
+  AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles
+  AC_CACHE_CHECK([whether logl works], [gl_cv_func_logl_works],
+    [
+      AC_RUN_IFELSE(
+        [AC_LANG_SOURCE([[
+#ifndef __NO_MATH_INLINES
+# define __NO_MATH_INLINES 1 /* for glibc */
+#endif
+#include <float.h>
+#include <math.h>
+/* Override the values of <float.h>, like done in float.in.h.  */
+#if defined __i386__ && (defined __BEOS__ || defined __OpenBSD__)
+# undef LDBL_MANT_DIG
+# define LDBL_MANT_DIG   64
+# undef LDBL_MIN_EXP
+# define LDBL_MIN_EXP    (-16381)
+# undef LDBL_MAX_EXP
+# define LDBL_MAX_EXP    16384
+#endif
+#if defined __i386__ && (defined __FreeBSD__ || defined __DragonFly__)
+# undef LDBL_MANT_DIG
+# define LDBL_MANT_DIG   64
+# undef LDBL_MIN_EXP
+# define LDBL_MIN_EXP    (-16381)
+# undef LDBL_MAX_EXP
+# define LDBL_MAX_EXP    16384
+#endif
+#if (defined _ARCH_PPC || defined _POWER) && defined _AIX && (LDBL_MANT_DIG == 
106) && defined __GNUC__
+# undef LDBL_MIN_EXP
+# define LDBL_MIN_EXP DBL_MIN_EXP
+#endif
+#if defined __sgi && (LDBL_MANT_DIG >= 106)
+# undef LDBL_MANT_DIG
+# define LDBL_MANT_DIG 106
+# if defined __GNUC__
+#  undef LDBL_MIN_EXP
+#  define LDBL_MIN_EXP DBL_MIN_EXP
+# endif
+#endif
+#undef logl
+extern
+#ifdef __cplusplus
+"C"
+#endif
+long double logl (long double);
+static long double dummy (long double x) { return 0; }
+volatile long double gx;
+long double gy;
+int main (int argc, char *argv[])
+{
+  long double (* volatile my_logl) (long double) = argc ? logl : dummy;
+  int result = 0;
+  /* This test fails on OSF/1 5.1.  */
+  {
+    gx = -0.0L;
+    gy = logl (gx);
+    if (!(gy + gy == gy))
+      result |= 1;
+  }
+  /* This test fails on NetBSD 8.0.  */
+  {
+    const long double TWO_LDBL_MANT_DIG = /* 2^LDBL_MANT_DIG */
+      (long double) (1U << ((LDBL_MANT_DIG - 1) / 5))
+      * (long double) (1U << ((LDBL_MANT_DIG - 1 + 1) / 5))
+      * (long double) (1U << ((LDBL_MANT_DIG - 1 + 2) / 5))
+      * (long double) (1U << ((LDBL_MANT_DIG - 1 + 3) / 5))
+      * (long double) (1U << ((LDBL_MANT_DIG - 1 + 4) / 5));
+    long double x = 16.981137113807045L;
+    long double err = (my_logl (x) + my_logl (1.0L / x)) * TWO_LDBL_MANT_DIG;
+    if (!(err >= -100.0L && err <= 100.0L))
+      result |= 2;
+  }
+
+  return result;
+}
+]])],
+        [gl_cv_func_logl_works=yes],
+        [gl_cv_func_logl_works=no],
+        [case "$host_os" in
+                          # Guess yes on glibc systems.
+           *-gnu* | gnu*) gl_cv_func_logl_works="guessing yes" ;;
+                          # Guess yes on musl systems.
+           *-musl*)       gl_cv_func_logl_works="guessing yes" ;;
+                          # Guess yes on native Windows.
+           mingw*)        gl_cv_func_logl_works="guessing yes" ;;
+                          # If we don't know, obey --enable-cross-guesses.
+           *)             gl_cv_func_logl_works="$gl_cross_guess_normal" ;;
+         esac
+        ])
+    ])
+])
diff --git a/src/datamash.c b/src/datamash.c
index 57844a3..9a32802 100644
--- a/src/datamash.c
+++ b/src/datamash.c
@@ -208,7 +208,7 @@ which require a pair of fields (e.g. 'pcov 2:6').\n"), 
stdout);
 
       fputs (_("Statistical Grouping operations:\n"),stdout);
       fputs ("\
-  mean, trimmean, median, q1, q3, iqr, perc, mode, antimode, \n\
+  mean, geomean, trimmean, median, q1, q3, iqr, perc, mode, antimode, \n\
   pstdev, sstdev, pvar, svar, mad, madraw,\n\
   pskew, sskew, pkurt, skurt, dpo, jarque,\n\
   scov, pcov, spearson, ppearson\n\
diff --git a/src/field-ops.c b/src/field-ops.c
index 582b9a6..dc9764d 100644
--- a/src/field-ops.c
+++ b/src/field-ops.c
@@ -76,6 +76,8 @@ struct operation_data operations[] =
   {STRING_SCALAR,  IGNORE_FIRST, STRING_RESULT},
   /* OP_MEAN */
   {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
+  /* OP_GEOMEAN */
+  {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
   /* OP_MEDIAN */
   {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
   /* OP_QUARTILE_1 */
@@ -525,6 +527,7 @@ field_op_collect (struct fieldop *op,
     case OP_S_COVARIANCE:
     case OP_P_PEARSON_COR:
     case OP_S_PEARSON_COR:
+    case OP_GEOMEAN:
     case OP_TRIMMED_MEAN:
       field_op_add_value (op, num_value);
       break;
@@ -689,6 +692,7 @@ field_op_summarize_empty (struct fieldop *op)
   switch (op->op)                                /* LCOV_EXCL_BR_LINE */
     {
     case OP_MEAN:
+    case OP_GEOMEAN:
     case OP_S_SKEWNESS:
     case OP_P_SKEWNESS:
     case OP_S_EXCESS_KURTOSIS:
@@ -859,6 +863,10 @@ field_op_summarize (struct fieldop *op)
                                           op->params.percentile / 100.0 );
       break;
 
+    case OP_GEOMEAN:
+      numeric_result = geometric_mean_value(op->values, op->num_values);
+      break;
+
     case OP_TRIMMED_MEAN:
       field_op_sort_values (op);
       numeric_result = trimmed_mean_value ( op->values, op->num_values,
diff --git a/src/field-ops.h b/src/field-ops.h
index 134aab4..6e3bb3f 100644
--- a/src/field-ops.h
+++ b/src/field-ops.h
@@ -98,7 +98,8 @@ struct fieldop
   /* NUMERIC_SCALAR operations */
   size_t count; /* number of items collected so far in a group */
   long double value; /* for single-value operations (sum, min, max, absmin,
-                        absmax, mean) - this is the accumulated value */
+                        absmax, mean, geomean) - this is the accumulated
+                        value */
 
   /* NUMERIC_VECTOR operations */
   long double *values;     /* array for multi-valued ops (median,mode,stdev) */
diff --git a/src/op-defs.c b/src/op-defs.c
index bddbcc5..13e0b7b 100644
--- a/src/op-defs.c
+++ b/src/op-defs.c
@@ -46,6 +46,7 @@ struct field_operation_definition field_operations[] =
   {"last",    OP_LAST,    MODE_GROUPBY},
   {"rand",    OP_RAND,    MODE_GROUPBY},
   {"mean",    OP_MEAN,    MODE_GROUPBY},
+  {"geomean", OP_GEOMEAN, MODE_GROUPBY},
   {"median",  OP_MEDIAN,  MODE_GROUPBY},
   {"q1",      OP_QUARTILE_1, MODE_GROUPBY},
   {"q3",      OP_QUARTILE_3, MODE_GROUPBY},
diff --git a/src/op-defs.h b/src/op-defs.h
index 898c441..21c86de 100644
--- a/src/op-defs.h
+++ b/src/op-defs.h
@@ -36,6 +36,7 @@ enum field_operation
   OP_LAST,
   OP_RAND,
   OP_MEAN,
+  OP_GEOMEAN,
   OP_MEDIAN,
   OP_QUARTILE_1,
   OP_QUARTILE_3,
diff --git a/src/utils.c b/src/utils.c
index 4b87bda..b643f9d 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -115,6 +115,18 @@ arithmetic_mean_value (const long double * const values, 
const size_t n)
   return mean;
 }
 
+/* Compute the geometric mean of an array of numbers > 0.0 */
+long double _GL_ATTRIBUTE_PURE
+geometric_mean_value (const long double * const values, const size_t n)
+{
+  long double sum=0;
+  long double mean;
+  for (size_t i = 0; i < n; i++)
+    sum += logl(values[i]);
+  mean = sum / n ;
+  return expl(mean);
+}
+
 long double _GL_ATTRIBUTE_PURE
 variance_value (const long double * const values, size_t n, int df)
 {
diff --git a/src/utils.h b/src/utils.h
index 1eee21e..c88269e 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -36,6 +36,10 @@ is_na (const char* value, const size_t len);
 long double
 arithmetic_mean_value (const long double * const values, const size_t n);
 
+/* Given an array of doubles > 0.0, return the geometric mean value */
+long double
+geometric_mean_value (const long double * const values, const size_t n);
+
 /*
  Given a sorted array of doubles, return the value of 'percentile'.
  Example of valid 'percentile':
diff --git a/tests/datamash-stats.pl b/tests/datamash-stats.pl
index d888c76..88644ac 100755
--- a/tests/datamash-stats.pl
+++ b/tests/datamash-stats.pl
@@ -297,6 +297,22 @@ my @Tests =
   ['mean12','mean 1' ,  {IN_PIPE=>$seq22}, {OUT => "67.45\n"},],
   ['mean13','mean 1' ,  {IN_PIPE=>$seq23}, {OUT => "6.125\n"},],
 
+  # Test geometric mean
+  ['geomean1', 'geomean 1' ,  {IN_PIPE=>$seq1},  {OUT => "2.213\n"}],
+  ['geomean2', 'geomean 1' ,  {IN_PIPE=>$seq2},  {OUT => "1.817\n"}],
+  ['geomean3', 'geomean 1' ,  {IN_PIPE=>$seq3},  {OUT => "2\n"}],
+  # sorted/unsorted should not change the result.
+  ['geomean4', 'geomean 1' ,  {IN_PIPE=>$seq9},  {OUT => "8.464\n"},],
+  ['geomean5', 'geomean 1' ,  {IN_PIPE=>$seq10}, {OUT => "9.573\n"}],
+  ['geomean6', 'geomean 1' ,  {IN_PIPE=>$seq12}, {OUT => "11.817\n"},],
+  ['geomean7', 'geomean 1' ,  {IN_PIPE=>$seq11}, {OUT => "10.653\n"},],
+  ['geomean8', 'geomean 1' ,  {IN_PIPE=>$seq12_unsorted}, {OUT => 
"11.817\n"},],
+  ['geomean9', '--sort geomean 1' ,
+     {IN_PIPE=>$seq12_unsorted}, {OUT => "11.817\n"},],
+  ['geomean10','geomean 1' ,  {IN_PIPE=>$seq20}, {OUT => "99.596\n"},],
+  ['geomean11','geomean 1' ,  {IN_PIPE=>$seq21}, {OUT => "34.644\n"},],
+  ['geomean12','geomean 1' ,  {IN_PIPE=>$seq22}, {OUT => "67.386\n"},],
+ 
   # Test median
   ['med1', 'median 1' ,  {IN_PIPE=>$seq1},  {OUT => "2.5\n"}],
   ['med2', 'median 1' ,  {IN_PIPE=>$seq2},  {OUT => "2\n"}],
-- 
2.17.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]