Speed of for-loops

octave-maintainers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Speed of for-loops

From:	David Bateman
Subject:	Speed of for-loops
Date:	Mon, 30 Apr 2007 01:28:00 +0200
User-agent:	Thunderbird 1.5.0.7 (X11/20060921)

I took a look at the code n pt-loop.cc to see if there was some way to
get a speed improvement from the for-loops. I found a few possible
improvements and one error. Firstly for the error, in DO_ND_LOOP the
values j and nr are of type int, when they should be of type
octave_idx_type, otherwise 64-bit indexing over matrices won't work
correctly.

The speed-up make the most impact when the argument to the for-loop is a
matrix, though I did speed up Cell and Range types as well a little. The
speed-ups include

1) predefining several variables
2) Removing redundant assignments of the variable "quit" as it is always
defined in quit_loop_now.
3) Special case rows = 0 and 1 cases in loops over arrays.
4) Include some of the code that was separate from the macro DO_ND_LOOP
in the macro itself

I ran "make check" and there were no additionally failing tests. I
attach a short benchmarking script together with the patch. Running this
before patching, I get

octave:1> bench
Bench Range: 3.75443
Bench Vector: 5.66414
Bench Matrix: 6.74297
Bench Cell: 6.29804
octave:2> bench
Bench Range: 3.76143
Bench Vector: 5.69913
Bench Matrix: 6.74397
Bench Cell: 6.28005
octave:3> bench
Bench Range: 3.76543
Bench Vector: 5.72213
Bench Matrix: 6.79597
Bench Cell: 6.25705

and after patching I get

octave:1> bench
Bench Range: 3.63045
Bench Vector: 3.62145
Bench Matrix: 5.31619
Bench Cell: 6.16506
octave:2> bench
Bench Range: 3.62345
Bench Vector: 3.61845
Bench Matrix: 5.32819
Bench Cell: 6.21305
octave:3> bench
Bench Range: 3.65144
Bench Vector: 3.63045
Bench Matrix: 5.35918
Bench Cell: 6.19806

The reason the cell array is not improved as much at the other array
types is that the loop size is short as the cellfun call dominants the
time. Note this is a simple test and more complex loops will probably
see less improvement. However, it still seems to me that this patch has
some value..

Cheers
David

Index: src/pt-loop.cc
===================================================================
RCS file: /usr/local/cvsroot/octave/src/pt-loop.cc,v
retrieving revision 1.35
diff -u -r1.35 pt-loop.cc
--- src/pt-loop.cc      26 Apr 2007 20:23:31 -0000      1.35
+++ src/pt-loop.cc      29 Apr 2007 23:15:15 -0000
@@ -219,8 +219,6 @@
                                           const octave_value& rhs,
                                           bool& quit)
 {
-  quit = false;
-
   ult.assign (octave_value::op_asn_eq, rhs);
 
   if (! error_state)
@@ -239,54 +237,84 @@
   quit = quit_loop_now ();
 }
 
-#define DO_LOOP(arg) \
+#define DO_ND_LOOP(MTYPE, TYPE, CONV, ARG) \
   do \
     { \
-      for (int i = 0; i < steps; i++) \
-       { \
-         MAYBE_DO_BREAKPOINT; \
- \
-         octave_value val (arg); \
- \
-         bool quit = false; \
- \
-         do_for_loop_once (ult, val, quit); \
- \
-         if (quit) \
-           break; \
-       } \
-    } \
-  while (0)
-
-#define DO_ND_LOOP(TYPE, ARG) \
-  do \
-    { \
-      octave_idx_type steps = dv(1); \
- \
-      for (octave_idx_type i = 0; i < steps; i++) \
-       { \
-         MAYBE_DO_BREAKPOINT; \
- \
-          TYPE tmp; \
+      dim_vector dv = ARG.dims (); \
  \
-          int nr = ARG.rows (); \
+      bool quit = false; \
  \
-         tmp.resize (dim_vector (nr, 1)); \
+      TYPE *atmp = ARG.fortran_vec (); \
  \
-         for (int j = 0; j < nr; j++) \
-           tmp.xelem (j) = ARG.xelem (j, i); \
+      octave_idx_type steps = dv (1); \
  \
-          octave_value val (tmp); \
+      octave_idx_type nrows = dv (0); \
  \
-         bool quit = false; \
+      int ndims = dv.length (); \
+      if (ndims > 2) \
+        { \
+          for (int i = 2; i < ndims; i++) \
+            steps *= dv(i); \
+          dv (1) = steps; \
+          dv.resize (2); \
+        } \
  \
-         do_for_loop_once (ult, val, quit); \
-         quit = (i == steps - 1 ? true : quit); \
- \
-         if (quit) \
-           break; \
- \
-       } \
+      if (steps > 0) \
+       { \
+          if (nrows == 0) \
+            { \
+             octave_value val (MTYPE ( dim_vector (0, 1))); \
+ \
+             for (octave_idx_type i = 0; i < steps; i++) \
+               { \
+                 MAYBE_DO_BREAKPOINT; \
+ \
+                 do_for_loop_once (ult, val, quit); \
+ \
+                 if (quit) \
+                   break; \
+              } \
+            } \
+          else if (nrows == 1) \
+            { \
+             for (octave_idx_type i = 0; i < steps; i++) \
+               { \
+                 MAYBE_DO_BREAKPOINT; \
+ \
+                 octave_value val (CONV (*atmp++)); \
+ \
+                 do_for_loop_once (ult, val, quit); \
+ \
+                 if (quit) \
+                   break; \
+              } \
+            } \
+          else \
+            { \
+              if (ndims > 2) \
+                ARG = ARG.reshape (dv); \
+ \
+              MTYPE tmp (dim_vector (nrows, 1)); \
+ \
+              TYPE *ftmp = tmp.fortran_vec (); \
+ \
+              for (octave_idx_type i = 0; i < steps; i++) \
+               { \
+                 MAYBE_DO_BREAKPOINT; \
+ \
+                 for (int j = 0; j < nrows; j++) \
+                   ftmp[j] = *atmp++;  \
+ \
+                  octave_value val (tmp); \
+ \
+                  do_for_loop_once (ult, val, quit); \
+                  quit = (i == steps - 1 ? true : quit); \
+ \
+                 if (quit) \
+                   break; \
+               } \
+           } \
+        } \
     } \
   while (0)
 
@@ -326,17 +354,15 @@
        octave_idx_type steps = rng.nelem ();
        double b = rng.base ();
        double increment = rng.inc ();
+       bool quit = false;
+       double tmp_val = b;
 
-       for (octave_idx_type i = 0; i < steps; i++)
+       for (octave_idx_type i = 0; i < steps; i++, tmp_val += increment)
          {
            MAYBE_DO_BREAKPOINT;
 
-           double tmp_val = b + i * increment;
-
            octave_value val (tmp_val);
 
-           bool quit = false;
-
            do_for_loop_once (ult, val, quit);
 
            if (quit)
@@ -356,12 +382,25 @@
        charMatrix chm_tmp = rhs.char_matrix_value ();
        octave_idx_type nr = chm_tmp.rows ();
        octave_idx_type steps = chm_tmp.columns ();
+       bool quit = false;
 
        if (error_state)
          goto cleanup;
 
        if (nr == 1)
-         DO_LOOP (chm_tmp (0, i));
+         {
+           for (octave_idx_type i = 0; i < steps; i++)
+             {
+               MAYBE_DO_BREAKPOINT;
+
+               octave_value val (chm_tmp.xelem (0, i));
+
+               do_for_loop_once (ult, val, quit);
+
+               if (quit)
+                 break;
+             }
+         }
        else
          {
            for (octave_idx_type i = 0; i < steps; i++)
@@ -370,8 +409,6 @@
 
                octave_value val (chm_tmp.extract (0, i, nr-1, i), true);
 
-               bool quit = false;
-
                do_for_loop_once (ult, val, quit);
 
                if (quit)
@@ -381,52 +418,31 @@
       }
     else if (rhs.is_matrix_type ())
       {
-       NDArray m_tmp;
-       ComplexNDArray cm_tmp;
-       dim_vector dv;
-
        if (rhs.is_real_type ())
          {
-           m_tmp = rhs.array_value ();
-           dv = m_tmp.dims ();
-         }
-       else
-         {
-           cm_tmp = rhs.complex_array_value ();
-           dv = cm_tmp.dims ();
-         }
-
-       if (error_state)
-         goto cleanup;
+           NDArray m_tmp = rhs.array_value ();
 
-       // FIXME -- maybe we need a function for this?
-       int ndims = dv.length ();
-       for (int i = 2; i < ndims; i++)
-         dv(1) *= dv(i);
-       dv.resize (2);
+           if (error_state)
+             goto cleanup;
 
-       if (dv(1) > 0)
+           DO_ND_LOOP(NDArray, double, , m_tmp);
+         }
+       else
          {
-           if (rhs.is_real_type ())
-             {
-               if (ndims > 2)
-                 m_tmp = m_tmp.reshape (dv);
+           ComplexNDArray cm_tmp = rhs.complex_array_value ();
 
-               DO_ND_LOOP(NDArray, m_tmp);
-             }
-           else
-             {
-               if (ndims > 2)
-                 cm_tmp = cm_tmp.reshape (dv);
+           if (error_state)
+             goto cleanup;
 
-               DO_ND_LOOP(ComplexNDArray, cm_tmp);
-             }
+           DO_ND_LOOP(ComplexNDArray, Complex, , cm_tmp);
          }
       }
     else if (rhs.is_map ())
       {
        Octave_map tmp_val (rhs.map_value ());
 
+       bool quit = false;
+
        for (Octave_map::iterator p = tmp_val.begin ();
             p != tmp_val.end ();
             p++)
@@ -438,8 +454,6 @@
            octave_value val
              = (val_lst.length () == 1) ? val_lst(0) : octave_value (val_lst);
 
-           bool quit = false;
-
            do_for_loop_once (ult, val, quit);
 
            if (quit)
@@ -450,21 +464,7 @@
       {
        Cell c_tmp = rhs.cell_value ();
 
-       dim_vector dv = c_tmp.dims ();
-
-       // FIXME -- maybe we need a function for this?
-       int ndims = dv.length ();
-       for (int i = 2; i < ndims; i++)
-         dv(1) *= dv(i);
-       dv.resize (2);
-
-       if (dv(1) > 0)
-         {
-           if (ndims > 2)
-             c_tmp = c_tmp.reshape (dv);
-
-           DO_ND_LOOP(Cell, c_tmp);
-         }
+       DO_ND_LOOP (Cell, octave_value, Cell, c_tmp);
       }
     else
       {

n=1e6; 

x = 0; 
t0 = cputime(); 
for i=1:n
  x += i; 
endfor 
fprintf("Bench Range: %g\n", cputime() - t0);
fflush(stdout);

x = 0;
v = [1:n];
t0 = cputime(); 
for i=v
  x += i; 
endfor 
fprintf("Bench Vector: %g\n", cputime() - t0);
fflush(stdout);

x = [0;0];
v = [1:n;1:n];
t0 = cputime(); 
for i=v
  x += i; 
endfor 
fprintf("Bench Matrix: %g\n", cputime() - t0);
fflush(stdout);

x = 0;
c = mat2cell(1:(n/100),1,ones(1,n/100));
t0 = cputime();
for i = c
  x = cellfun (@(a) a + x, i);
endfor 
fprintf("Bench Cell: %g\n", cputime() - t0);
fflush(stdout);

[Prev in Thread]

Current Thread

[Next in Thread]

Speed of for-loops, David Bateman <=
- Speed of for-loops, John W. Eaton, 2007/04/30

Prev by Date: Re: Bug in the 'mex' garbage collector (octave 2.9.10)
Next by Date: Re: Another dynamically linked documentation patch
Previous by thread: bug in pkg, untar
Next by thread: Speed of for-loops
Index(es):
- Date
- Thread