[PATCH] OpenMP loop level parallelism

freepooma-devel
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] OpenMP loop level parallelism

From:	Richard Guenther
Subject:	[PATCH] OpenMP loop level parallelism
Date:	Tue, 23 Dec 2003 20:17:37 +0100 (CET)
On Fri, 28 Nov 2003, Richard Guenther wrote:

> The attached patch adds loop-level parallelism via OpenMP directives. It
> is tested with a full regtest using 2 threads and the Intel compiler 8.0
> on an ia32 machine with no regressions compared to non-OpenMP compilation.
> Performance and scaling was _not_ evaluated yet (I will have a 4 processor
> Itanium available within the next few weeks).

I was just able to check the performance on the 4 processor Itanium and
scaling is very good (though 4 processors is not a lot) for my CFD code:

1 CPU:  4.9s/iteration
2 CPU:  2.4s/iteration
3 CPU:  1.7s/iteration
4 CPU:  1.3s/iteration

This adds an easy way to explore parallelism in POOMA based applications
and is certainly better than non-compiling SMARTS.

> So this is a request for comments and comparisons with parallelization
> using threads from SMARTS. Anyone interested should report
> success/failure.
>
> Suggested operation is compiling the library in serial mode (with openmp
> enabled, edit the config/arch/ file) and best use single patch engines or
> at least only a single patch with multi-patch engines.
>
> Thanks for any comments,

And there were appearantly no objections in general for the patch, so, ok
to apply? (It's been through regtesting the first time I submitted it with
OpenMP enabled and since a lot more often with OpenMP disabled)

Richard.


2003Dec23  Richard Guenther <address@hidden>

        * Evaluator/InlineEvaluator.h: parallelize loops with
        #pragma omp parallel for.
        Evaluator/LoopApply.h: likewise.
        Evaluator/ReductionEvaluator.h: likewise, do final reduction
        manually.

Index: Evaluator/InlineEvaluator.h
===================================================================
RCS file: /home/pooma/Repository/r2/src/Evaluator/InlineEvaluator.h,v
retrieving revision 1.28
diff -u -u -r1.28 InlineEvaluator.h
--- Evaluator/InlineEvaluator.h 22 Oct 2003 20:43:26 -0000      1.28
+++ Evaluator/InlineEvaluator.h 27 Nov 2003 20:57:35 -0000
@@ -149,6 +149,7 @@
     LHS localLHS(lhs);
     RHS localRHS(rhs);
     int e0 = domain[0].length();
+#pragma omp parallel for if (e0 > 512)
     for (int i0=0; i0<e0; ++i0)
       op(localLHS(i0),localRHS.read(i0));
   }
@@ -164,6 +165,7 @@
     RHS localRHS(rhs);
     int e0 = domain[0].length();
     int e1 = domain[1].length();
+#pragma omp parallel for
     for (int i1=0; i1<e1; ++i1)
       for (int i0=0; i0<e0; ++i0)
        op(localLHS(i0,i1),localRHS.read(i0,i1));
@@ -182,6 +184,7 @@
     int e0 = domain[0].length();
     int e1 = domain[1].length();
     int e2 = domain[2].length();
+#pragma omp parallel for
     for (int i2=0; i2<e2; ++i2)
       for (int i1=0; i1<e1; ++i1)
        for (int i0=0; i0<e0; ++i0)
@@ -203,6 +206,7 @@
     int e1 = domain[1].length();
     int e2 = domain[2].length();
     int e3 = domain[3].length();
+#pragma omp parallel for
     for (int i3=0; i3<e3; ++i3)
       for (int i2=0; i2<e2; ++i2)
        for (int i1=0; i1<e1; ++i1)
@@ -227,6 +231,7 @@
     int e2 = domain[2].length();
     int e3 = domain[3].length();
     int e4 = domain[4].length();
+#pragma omp parallel for
     for (int i4=0; i4<e4; ++i4)
       for (int i3=0; i3<e3; ++i3)
        for (int i2=0; i2<e2; ++i2)
@@ -254,6 +259,7 @@
     int e3 = domain[3].length();
     int e4 = domain[4].length();
     int e5 = domain[5].length();
+#pragma omp parallel for
     for (int i5=0; i5<e5; ++i5)
       for (int i4=0; i4<e4; ++i4)
        for (int i3=0; i3<e3; ++i3)
@@ -285,6 +291,7 @@
     int e4 = domain[4].length();
     int e5 = domain[5].length();
     int e6 = domain[6].length();
+#pragma omp parallel for
     for (int i6=0; i6<e6; ++i6)
       for (int i5=0; i5<e5; ++i5)
        for (int i4=0; i4<e4; ++i4)
Index: Evaluator/LoopApply.h
===================================================================
RCS file: /home/pooma/Repository/r2/src/Evaluator/LoopApply.h,v
retrieving revision 1.7
diff -u -u -r1.7 LoopApply.h
--- Evaluator/LoopApply.h       22 Oct 2003 20:43:26 -0000      1.7
+++ Evaluator/LoopApply.h       27 Nov 2003 20:57:35 -0000
@@ -104,6 +104,7 @@
     CTAssert(Domain::unitStride);
     int f0 = domain[0].first();
     int e0 = domain[0].last();
+#pragma omp parallel for if (e0 > 512)
     for (int i0 = f0; i0 <= e0; ++i0)
       op(i0);
   }
@@ -116,6 +117,7 @@
     int f1 = domain[1].first();
     int e0 = domain[0].last();
     int e1 = domain[1].last();
+#pragma omp parallel for
     for (int i1 = f1; i1 <= e1; ++i1)
       for (int i0 = f0; i0 <= e0; ++i0)
        op(i0, i1);
@@ -131,6 +133,7 @@
     int e0 = domain[0].last();
     int e1 = domain[1].last();
     int e2 = domain[2].last();
+#pragma omp parallel for
     for (int i2 = f2; i2 <= e2; ++i2)
       for (int i1 = f1; i1 <= e1; ++i1)
        for (int i0 = f0; i0 <= e0; ++i0)
@@ -149,6 +152,7 @@
     int e1 = domain[1].last();
     int e2 = domain[2].last();
     int e3 = domain[3].last();
+#pragma omp parallel for
     for (int i3 = f3; i3 <= e3; ++i3)
       for (int i2 = f2; i2 <= e2; ++i2)
        for (int i1 = f1; i1 <= e1; ++i1)
@@ -170,6 +174,7 @@
     int e2 = domain[2].last();
     int e3 = domain[3].last();
     int e4 = domain[4].last();
+#pragma omp parallel for
     for (int i4 = f4; i4 <= e4; ++i4)
       for (int i3 = f3; i3 <= e3; ++i3)
        for (int i2 = f2; i2 <= e2; ++i2)
@@ -194,6 +199,7 @@
     int e3 = domain[3].last();
     int e4 = domain[4].last();
     int e5 = domain[5].last();
+#pragma omp parallel for
     for (int i5 = f5; i5 <= e5; ++i5)
       for (int i4 = f4; i4 <= e4; ++i4)
        for (int i3 = f3; i3 <= e3; ++i3)
@@ -221,6 +227,7 @@
     int e4 = domain[4].last();
     int e5 = domain[5].last();
     int e6 = domain[6].last();
+#pragma omp parallel for
     for (int i6 = f6; i6 <= e6; ++i6)
       for (int i5 = f5; i5 <= e5; ++i5)
        for (int i4 = f4; i4 <= e4; ++i4)
Index: Evaluator/ReductionEvaluator.h
===================================================================
RCS file: /home/pooma/Repository/r2/src/Evaluator/ReductionEvaluator.h,v
retrieving revision 1.9
diff -u -u -r1.9 ReductionEvaluator.h
--- Evaluator/ReductionEvaluator.h      29 Oct 2003 20:13:27 -0000      1.9
+++ Evaluator/ReductionEvaluator.h      27 Nov 2003 20:57:36 -0000
@@ -108,6 +108,56 @@
 };


+/**
+ * Class to hold static array for partial reduction results
+ * and routine for final reduction. Two versions, one dummy
+ * for non-OpenMP, one for OpenMP operation.
+ */
+
+#ifndef _OPENMP
+template<class T>
+struct PartialReduction {
+  static inline void init() {}
+  inline void storePartialResult(const T& result)
+  {
+    answer = result;
+  }
+  template <class Op>
+  inline void reduce(T& ret, const Op&)
+  {
+    ret = answer;
+  }
+  T answer;
+};
+#else
+template<class T>
+struct PartialReduction {
+  static inline void init()
+  {
+    if (!answer)
+      answer = new T[omp_get_max_threads()];
+  }
+  inline void storePartialResult(const T& result)
+  {
+    int n = omp_get_thread_num();
+    answer[n] = result;
+    if (n == 0)
+      num_threads = omp_get_num_threads();
+  }
+  template <class Op>
+  inline void reduce(T& ret, const Op& op)
+  {
+    T res = answer[0];
+    for (int i = 1; i<num_threads; ++i)
+      op(res, answer[i]);
+    ret = res;
+  }
+  int num_threads;
+  static T *answer;
+};
+template <class T>
+T *PartialReduction<T>::answer = NULL;
+#endif


 //-----------------------------------------------------------------------------
@@ -130,6 +180,7 @@
 template<>
 struct ReductionEvaluator<InlineKernelTag>
 {
+
   //---------------------------------------------------------------------------
   // Input an expression and cause it to be evaluated.
   // All this template function does is extract the domain
@@ -139,6 +190,7 @@
   inline static void evaluate(T &ret, const Op &op, const Expr &e)
   {
     typedef typename Expr::Domain_t Domain_t;
+    PartialReduction<T>::init();
     evaluate(ret, op, e, e.domain(),
       WrappedInt<Domain_t::dimensions>());
   }
@@ -171,7 +223,7 @@
   //
   // NOTE: These loops assume that the domain passed in is a unit-stride
   // domain starting at 0.  Assertions are made to make sure this is true.
-
+
   template<class T, class Op, class Expr, class Domain>
   inline static void evaluate(T &ret, const Op &op, const Expr &e,
     const Domain &domain, WrappedInt<1>)
@@ -181,9 +233,16 @@
     Expr localExpr(e);
     int e0 = domain[0].length();

-    T answer = ReductionTraits<Op, T>::identity();
-    for (int i0 = 0; i0 < e0; ++i0)
-      op(answer, localExpr.read(i0));
+    PartialReduction<T> reduction;
+#pragma omp parallel if (e0 > 512)
+    {
+      T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+      for (int i0 = 0; i0 < e0; ++i0)
+        op(answer, localExpr.read(i0));
+        reduction.storePartialResult(answer);
+    }
+    reduction.reduce(ret, op);

     ret = answer;
   }
@@ -199,12 +258,17 @@
     int e0 = domain[0].length();
     int e1 = domain[1].length();

-    T answer = ReductionTraits<Op, T>::identity();
-    for (int i1 = 0; i1 < e1; ++i1)
-      for (int i0 = 0; i0 < e0; ++i0)
-       op(answer, localExpr.read(i0, i1));
-
-    ret = answer;
+    PartialReduction<T> reduction;
+#pragma omp parallel
+    {
+      T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+      for (int i1 = 0; i1 < e1; ++i1)
+       for (int i0 = 0; i0 < e0; ++i0)
+         op(answer, localExpr.read(i0, i1));
+      reduction.storePartialResult(answer);
+    }
+    reduction.reduce(ret, op);
   }

   template<class T, class Op, class Expr, class Domain>
@@ -220,13 +284,18 @@
     int e1 = domain[1].length();
     int e2 = domain[2].length();

-    T answer = ReductionTraits<Op, T>::identity();
-    for (int i2 = 0; i2 < e2; ++i2)
-      for (int i1 = 0; i1 < e1; ++i1)
-       for (int i0 = 0; i0 < e0; ++i0)
-         op(answer, localExpr.read(i0, i1, i2));
-
-    ret = answer;
+    PartialReduction<T> reduction;
+#pragma omp parallel
+    {
+      T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+      for (int i2 = 0; i2 < e2; ++i2)
+       for (int i1 = 0; i1 < e1; ++i1)
+         for (int i0 = 0; i0 < e0; ++i0)
+           op(answer, localExpr.read(i0, i1, i2));
+      reduction.storePartialResult(answer);
+    }
+    reduction.reduce(ret, op);
   }

   template<class T, class Op, class Expr, class Domain>
@@ -244,14 +313,19 @@
     int e2 = domain[2].length();
     int e3 = domain[3].length();

-    T answer = ReductionTraits<Op, T>::identity();
-    for (int i3 = 0; i3 < e3; ++i3)
-      for (int i2 = 0; i2 < e2; ++i2)
-        for (int i1 = 0; i1 < e1; ++i1)
-         for (int i0 = 0; i0 < e0; ++i0)
-           op(answer, localExpr.read(i0, i1, i2, i3));
-
-    ret = answer;
+    PartialReduction<T> reduction;
+#pragma omp parallel
+    {
+      T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+      for (int i3 = 0; i3 < e3; ++i3)
+       for (int i2 = 0; i2 < e2; ++i2)
+         for (int i1 = 0; i1 < e1; ++i1)
+           for (int i0 = 0; i0 < e0; ++i0)
+             op(answer, localExpr.read(i0, i1, i2, i3));
+      reduction.storePartialResult(answer);
+    }
+    reduction.reduce(ret, op);
   }

   template<class T, class Op, class Expr, class Domain>
@@ -271,15 +345,20 @@
     int e3 = domain[3].length();
     int e4 = domain[4].length();

-    T answer = ReductionTraits<Op, T>::identity();
-    for (int i4 = 0; i4 < e4; ++i4)
-      for (int i3 = 0; i3 < e3; ++i3)
-        for (int i2 = 0; i2 < e2; ++i2)
-          for (int i1 = 0; i1 < e1; ++i1)
-           for (int i0 = 0; i0 < e0; ++i0)
-             op(answer, localExpr.read(i0, i1, i2, i3, i4));
-
-    ret = answer;
+    PartialReduction<T> reduction;
+#pragma omp parallel
+    {
+      T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+      for (int i4 = 0; i4 < e4; ++i4)
+       for (int i3 = 0; i3 < e3; ++i3)
+         for (int i2 = 0; i2 < e2; ++i2)
+           for (int i1 = 0; i1 < e1; ++i1)
+             for (int i0 = 0; i0 < e0; ++i0)
+               op(answer, localExpr.read(i0, i1, i2, i3, i4));
+      reduction.storePartialResult(answer);
+    }
+    reduction.reduce(ret, op);
   }

   template<class T, class Op, class Expr, class Domain>
@@ -301,16 +380,21 @@
     int e4 = domain[4].length();
     int e5 = domain[5].length();

-    T answer = ReductionTraits<Op, T>::identity();
-    for (int i5 = 0; i5 < e5; ++i5)
-      for (int i4 = 0; i4 < e4; ++i4)
-        for (int i3 = 0; i3 < e3; ++i3)
-          for (int i2 = 0; i2 < e2; ++i2)
-            for (int i1 = 0; i1 < e1; ++i1)
-             for (int i0 = 0; i0 < e0; ++i0)
-               op(answer, localExpr.read(i0, i1, i2, i3, i4, i5));
-
-    ret = answer;
+    PartialReduction<T> reduction;
+#pragma omp parallel
+    {
+      T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+      for (int i5 = 0; i5 < e5; ++i5)
+       for (int i4 = 0; i4 < e4; ++i4)
+         for (int i3 = 0; i3 < e3; ++i3)
+           for (int i2 = 0; i2 < e2; ++i2)
+             for (int i1 = 0; i1 < e1; ++i1)
+               for (int i0 = 0; i0 < e0; ++i0)
+                 op(answer, localExpr.read(i0, i1, i2, i3, i4, i5));
+      reduction.storePartialResult(answer);
+    }
+    reduction.reduce(ret, op);
   }

   template<class T, class Op, class Expr, class Domain>
@@ -334,17 +418,22 @@
     int e5 = domain[5].length();
     int e6 = domain[6].length();

-    T answer = ReductionTraits<Op, T>::identity();
-    for (int i6 = 0; i6 < e6; ++i6)
-      for (int i5 = 0; i5 < e5; ++i5)
-        for (int i4 = 0; i4 < e4; ++i4)
-          for (int i3 = 0; i3 < e3; ++i3)
-            for (int i2 = 0; i2 < e2; ++i2)
-              for (int i1 = 0; i1 < e1; ++i1)
-               for (int i0 = 0; i0 < e0; ++i0)
-                 op(answer, localExpr.read(i0, i1, i2, i3, i4, i5, i6));
-
-    ret = answer;
+    PartialReduction<T> reduction;
+#pragma omp parallel
+    {
+      T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+      for (int i6 = 0; i6 < e6; ++i6)
+       for (int i5 = 0; i5 < e5; ++i5)
+         for (int i4 = 0; i4 < e4; ++i4)
+           for (int i3 = 0; i3 < e3; ++i3)
+             for (int i2 = 0; i2 < e2; ++i2)
+               for (int i1 = 0; i1 < e1; ++i1)
+                 for (int i0 = 0; i0 < e0; ++i0)
+                   op(answer, localExpr.read(i0, i1, i2, i3, i4, i5, i6));
+      reduction.storePartialResult(answer);
+    }
+    reduction.reduce(ret, op);
   }
 };
[Prev in Thread]
Current Thread
[Next in Thread]
[PATCH] OpenMP loop level parallelism, Richard Guenther <=
Prev by Date: Re: [pooma-dev] [PATCH] Track up-to-date faces
Next by Date: [PATCH] Fix where breakage
Previous by thread: [PATCH] Track up-to-date faces
Next by thread: [PATCH] Fix where breakage
Index(es):
- Date
- Thread