lmi-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[lmi-commits] [lmi] master e5850c2a 12/13: Import Naohiko Shimizu's NetB


From: Greg Chicares
Subject: [lmi-commits] [lmi] master e5850c2a 12/13: Import Naohiko Shimizu's NetBSD performance improvements
Date: Mon, 23 May 2022 17:40:27 -0400 (EDT)

branch: master
commit e5850c2a03126de2ad62545a3b673266a3efa591
Author: Gregory W. Chicares <gchicares@sbcglobal.net>
Commit: Gregory W. Chicares <gchicares@sbcglobal.net>

    Import Naohiko Shimizu's NetBSD performance improvements
    
    The canonical netlib.org sources don't incorporate these changes,
    but they're in glibc. They account for the discrepancies observed
    between the netlib.org originals and glibc:
    
    x86_64, sse, gcc, posix
    
    test_expm1_log1p: 1000000 trials
      0 errors [worst 0.00000000000000000000000 ulp] in expm1()
      0 errors [worst 0.00000000000000000000000 ulp] in log1p()
      0 errors [worst 0.00000000000000000000000 ulp] in monthly int
      0 errors [worst 0.00000000000000000000000 ulp] in daily int
    
    and also for the speed difference, which was formerly:
    
      lmi::expm1()     8.430e-04 s mean;        819 us least of 100 runs
      std::expm1()     8.169e-04 s mean;        751 us least of 100 runs
      lmi::log1p()     8.999e-04 s mean;        871 us least of 100 runs
      std::log1p()     8.085e-04 s mean;        796 us least of 100 runs
    
    but is now negligible:
    
      lmi::expm1()     7.583e-04 s mean;        751 us least of 100 runs
      std::expm1()     7.855e-04 s mean;        762 us least of 100 runs
      lmi::log1p()     7.789e-04 s mean;        770 us least of 100 runs
      std::log1p()     7.998e-04 s mean;        797 us least of 100 runs
---
 fdlibm_expm1.c | 35 +++++++++++++++++++++++++++--------
 fdlibm_log1p.c | 40 +++++++++++++++++++++++++++++++---------
 2 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/fdlibm_expm1.c b/fdlibm_expm1.c
index 3fcec962..db3c7a45 100644
--- a/fdlibm_expm1.c
+++ b/fdlibm_expm1.c
@@ -48,6 +48,12 @@
  * ====================================================
  */
 
+// $NetBSD: s_expm1.c,v 1.8 1995/05/10 20:47:09 jtc Exp $
+/* Modified by Naohiko Shimizu/Tokai University, Japan 1997/08/25,
+   for performance improvement on pipelined processors.
+// 
https://sourceware.org/git/?p=glibc.git;a=blobdiff;f=sysdeps/libm-ieee754/s_expm1.c;h=ed1aba527be837d4ed7e65e9c86b3fc3e14e65d7;hp=c8354b7262686370a761494dc785bb59fdf45477;hb=923609d1497f3116d57b297e3e84fc07b2b15b20;hpb=0d9f67937f0c9329c35c2c0d15848ab8316dc520
+*/
+
 /* expm1(x)
  * Returns exp(x)-1, the exponential of x minus 1.
  *
@@ -144,7 +150,6 @@
  */
 
 static const double
-one         = 1.0,
 huge        = 1.0e+300,
 tiny        = 1.0e-300,
 o_threshold = 7.09782712893383973096e+02,/* 0x40862E42, 0xFEFA39EF */
@@ -152,15 +157,24 @@ ln2_hi      = 6.93147180369123816490e-01,/* 0x3fe62e42, 
0xfee00000 */
 ln2_lo      = 1.90821492927058770002e-10,/* 0x3dea39ef, 0x35793c76 */
 invln2      = 1.44269504088896338700e+00,/* 0x3ff71547, 0x652b82fe */
         /* scaled coefficients related to expm1 */
-Q1  =  -3.33333333333331316428e-02, /* BFA11111 111110F4 */
-Q2  =   1.58730158725481460165e-03, /* 3F5A01A0 19FE5585 */
-Q3  =  -7.93650757867487942473e-05, /* BF14CE19 9EAADBB7 */
-Q4  =   4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */
-Q5  =  -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
+Q[] = { 1.0
+      ,-3.33333333333331316428e-02 /* BFA11111 111110F4 */
+      , 1.58730158725481460165e-03 /* 3F5A01A0 19FE5585 */
+      ,-7.93650757867487942473e-05 /* BF14CE19 9EAADBB7 */
+      , 4.00821782732936239552e-06 /* 3ED0CFCA 86E65239 */
+      ,-2.01099218183624371326e-07 /* BE8AFDB7 6E09C32D */
+};
+
+#define one Q[0]
+#define Q1  Q[1]
+#define Q2  Q[2]
+#define Q3  Q[3]
+#define Q4  Q[4]
+#define Q5  Q[5]
 
 double fdlibm_expm1(double x)
 {
-    double y,hi,lo,c,t,e,hxs,hfx,r1;
+    double y,hi,lo,c,t,e,hxs,hfx,r1,h2,h4,R1,R2,R3;
     int32_t k,xsb;
     uint32_t hx;
 
@@ -210,7 +224,12 @@ double fdlibm_expm1(double x)
     /* x is now in primary range */
     hfx = 0.5*x;
     hxs = x*hfx;
-    r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+//  performance improvement: Naohiko Shimizu 19970825
+//  r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+    R1 = one+hxs*Q1; h2 = hxs*hxs;
+    R2 = Q2+hxs*Q3; h4 = h2*h2;
+    R3 = Q4+hxs*Q5;
+    r1 = R1 + h2*R2 + h4*R3;
     t  = 3.0-r1*hfx;
     e  = hxs*((r1-t)/(6.0 - x*t));
     if(k==0) return x - (x*e-hxs); /* c is 0 */
diff --git a/fdlibm_log1p.c b/fdlibm_log1p.c
index ec6a1499..608ba12b 100644
--- a/fdlibm_log1p.c
+++ b/fdlibm_log1p.c
@@ -47,6 +47,12 @@
  * ====================================================
  */
 
+// $NetBSD: s_log1p.c,v 1.8 1995/05/10 20:47:46 jtc Exp $
+/* Modified by Naohiko Shimizu/Tokai University, Japan 1997/08/25,
+   for performance improvement on pipelined processors.
+// 
https://sourceware.org/git/?p=glibc.git;a=blobdiff;f=sysdeps/libm-ieee754/s_log1p.c;h=4ca01b1ff555693a3fa3459df82bef1c24c82668;hp=086c0dce6c118e2364aa883d0997d32493d43a97;hb=923609d1497f3116d57b297e3e84fc07b2b15b20;hpb=0d9f67937f0c9329c35c2c0d15848ab8316dc520
+*/
+
 /* double log1p(double x)
  *
  * Method :
@@ -116,19 +122,29 @@ static const double
 ln2_hi  =  6.93147180369123816490e-01,  /* 3fe62e42 fee00000 */
 ln2_lo  =  1.90821492927058770002e-10,  /* 3dea39ef 35793c76 */
 two54   =  1.80143985094819840000e+16,  /* 43500000 00000000 */
-Lp1 = 6.666666666666735130e-01,  /* 3FE55555 55555593 */
-Lp2 = 3.999999999940941908e-01,  /* 3FD99999 9997FA04 */
-Lp3 = 2.857142874366239149e-01,  /* 3FD24924 94229359 */
-Lp4 = 2.222219843214978396e-01,  /* 3FCC71C5 1D8E78AF */
-Lp5 = 1.818357216161805012e-01,  /* 3FC74664 96CB03DE */
-Lp6 = 1.531383769920937332e-01,  /* 3FC39A09 D078C69F */
-Lp7 = 1.479819860511658591e-01;  /* 3FC2F112 DF3E5244 */
+Lp[] = {0.0 // not used
+       ,6.666666666666735130e-01 /* 3FE55555 55555593 */
+       ,3.999999999940941908e-01 /* 3FD99999 9997FA04 */
+       ,2.857142874366239149e-01 /* 3FD24924 94229359 */
+       ,2.222219843214978396e-01 /* 3FCC71C5 1D8E78AF */
+       ,1.818357216161805012e-01 /* 3FC74664 96CB03DE */
+       ,1.531383769920937332e-01 /* 3FC39A09 D078C69F */
+       ,1.479819860511658591e-01 /* 3FC2F112 DF3E5244 */
+};
+
+#define Lp1 Lp[1]
+#define Lp2 Lp[2]
+#define Lp3 Lp[3]
+#define Lp4 Lp[4]
+#define Lp5 Lp[5]
+#define Lp6 Lp[6]
+#define Lp7 Lp[7]
 
 static double zero = 0.0;
 
 double fdlibm_log1p(double x)
 {
-    double hfsq,f,c,s,z,R,u;
+    double hfsq,f,c,s,z,R,u,z2,z4,z6,R1,R2,R3,R4;
     int32_t k,hx,hu,ax;
 
     hx = FDLIBM_HI(x);        /* high word of x */
@@ -187,7 +203,13 @@ double fdlibm_log1p(double x)
     }
     s = f/(2.0+f);
     z = s*s;
-    R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+//  performance improvement: Naohiko Shimizu 19970825
+//  R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+    R1 =     z*Lp1; z2=z*z;
+    R2 = Lp2+z*Lp3; z4=z2*z2;
+    R3 = Lp4+z*Lp5; z6=z4*z2;
+    R4 = Lp6+z*Lp7;
+    R = R1 + z2*R2 + z4*R3 + z6*R4;
     if(k==0) return f-(hfsq-s*(hfsq+R)); else
              return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
 }



reply via email to

[Prev in Thread] Current Thread [Next in Thread]