[RFC] OpenMP loop level parallelism
Richard Guenther
rguenth at tat.physik.uni-tuebingen.de
Fri Nov 28 17:17:13 UTC 2003
Hi!
The attached patch adds loop-level parallelism via OpenMP directives. It
is tested with a full regtest using 2 threads and the Intel compiler 8.0
on an ia32 machine with no regressions compared to non-OpenMP compilation.
Performance and scaling was _not_ evaluated yet (I will have a 4 processor
Itanium available within the next few weeks).
So this is a request for comments and comparisons with parallelization
using threads from SMARTS. Anyone interested should report
success/failure.
Suggested operation is compiling the library in serial mode (with openmp
enabled, edit the config/arch/ file) and best use single patch engines or
at least only a single patch with multi-patch engines.
Anyone with access to an SGI Origin or SGI Altix machine with lots of
processors?
Thanks for any comments,
Richard.
-------------- next part --------------
Index: Evaluator/InlineEvaluator.h
===================================================================
RCS file: /home/pooma/Repository/r2/src/Evaluator/InlineEvaluator.h,v
retrieving revision 1.28
diff -u -u -r1.28 InlineEvaluator.h
--- Evaluator/InlineEvaluator.h 22 Oct 2003 20:43:26 -0000 1.28
+++ Evaluator/InlineEvaluator.h 27 Nov 2003 20:57:35 -0000
@@ -149,6 +149,7 @@
LHS localLHS(lhs);
RHS localRHS(rhs);
int e0 = domain[0].length();
+#pragma omp parallel for if (e0 > 512)
for (int i0=0; i0<e0; ++i0)
op(localLHS(i0),localRHS.read(i0));
}
@@ -164,6 +165,7 @@
RHS localRHS(rhs);
int e0 = domain[0].length();
int e1 = domain[1].length();
+#pragma omp parallel for
for (int i1=0; i1<e1; ++i1)
for (int i0=0; i0<e0; ++i0)
op(localLHS(i0,i1),localRHS.read(i0,i1));
@@ -182,6 +184,7 @@
int e0 = domain[0].length();
int e1 = domain[1].length();
int e2 = domain[2].length();
+#pragma omp parallel for
for (int i2=0; i2<e2; ++i2)
for (int i1=0; i1<e1; ++i1)
for (int i0=0; i0<e0; ++i0)
@@ -203,6 +206,7 @@
int e1 = domain[1].length();
int e2 = domain[2].length();
int e3 = domain[3].length();
+#pragma omp parallel for
for (int i3=0; i3<e3; ++i3)
for (int i2=0; i2<e2; ++i2)
for (int i1=0; i1<e1; ++i1)
@@ -227,6 +231,7 @@
int e2 = domain[2].length();
int e3 = domain[3].length();
int e4 = domain[4].length();
+#pragma omp parallel for
for (int i4=0; i4<e4; ++i4)
for (int i3=0; i3<e3; ++i3)
for (int i2=0; i2<e2; ++i2)
@@ -254,6 +259,7 @@
int e3 = domain[3].length();
int e4 = domain[4].length();
int e5 = domain[5].length();
+#pragma omp parallel for
for (int i5=0; i5<e5; ++i5)
for (int i4=0; i4<e4; ++i4)
for (int i3=0; i3<e3; ++i3)
@@ -285,6 +291,7 @@
int e4 = domain[4].length();
int e5 = domain[5].length();
int e6 = domain[6].length();
+#pragma omp parallel for
for (int i6=0; i6<e6; ++i6)
for (int i5=0; i5<e5; ++i5)
for (int i4=0; i4<e4; ++i4)
Index: Evaluator/LoopApply.h
===================================================================
RCS file: /home/pooma/Repository/r2/src/Evaluator/LoopApply.h,v
retrieving revision 1.7
diff -u -u -r1.7 LoopApply.h
--- Evaluator/LoopApply.h 22 Oct 2003 20:43:26 -0000 1.7
+++ Evaluator/LoopApply.h 27 Nov 2003 20:57:35 -0000
@@ -104,6 +104,7 @@
CTAssert(Domain::unitStride);
int f0 = domain[0].first();
int e0 = domain[0].last();
+#pragma omp parallel for if (e0 > 512)
for (int i0 = f0; i0 <= e0; ++i0)
op(i0);
}
@@ -116,6 +117,7 @@
int f1 = domain[1].first();
int e0 = domain[0].last();
int e1 = domain[1].last();
+#pragma omp parallel for
for (int i1 = f1; i1 <= e1; ++i1)
for (int i0 = f0; i0 <= e0; ++i0)
op(i0, i1);
@@ -131,6 +133,7 @@
int e0 = domain[0].last();
int e1 = domain[1].last();
int e2 = domain[2].last();
+#pragma omp parallel for
for (int i2 = f2; i2 <= e2; ++i2)
for (int i1 = f1; i1 <= e1; ++i1)
for (int i0 = f0; i0 <= e0; ++i0)
@@ -149,6 +152,7 @@
int e1 = domain[1].last();
int e2 = domain[2].last();
int e3 = domain[3].last();
+#pragma omp parallel for
for (int i3 = f3; i3 <= e3; ++i3)
for (int i2 = f2; i2 <= e2; ++i2)
for (int i1 = f1; i1 <= e1; ++i1)
@@ -170,6 +174,7 @@
int e2 = domain[2].last();
int e3 = domain[3].last();
int e4 = domain[4].last();
+#pragma omp parallel for
for (int i4 = f4; i4 <= e4; ++i4)
for (int i3 = f3; i3 <= e3; ++i3)
for (int i2 = f2; i2 <= e2; ++i2)
@@ -194,6 +199,7 @@
int e3 = domain[3].last();
int e4 = domain[4].last();
int e5 = domain[5].last();
+#pragma omp parallel for
for (int i5 = f5; i5 <= e5; ++i5)
for (int i4 = f4; i4 <= e4; ++i4)
for (int i3 = f3; i3 <= e3; ++i3)
@@ -221,6 +227,7 @@
int e4 = domain[4].last();
int e5 = domain[5].last();
int e6 = domain[6].last();
+#pragma omp parallel for
for (int i6 = f6; i6 <= e6; ++i6)
for (int i5 = f5; i5 <= e5; ++i5)
for (int i4 = f4; i4 <= e4; ++i4)
Index: Evaluator/ReductionEvaluator.h
===================================================================
RCS file: /home/pooma/Repository/r2/src/Evaluator/ReductionEvaluator.h,v
retrieving revision 1.9
diff -u -u -r1.9 ReductionEvaluator.h
--- Evaluator/ReductionEvaluator.h 29 Oct 2003 20:13:27 -0000 1.9
+++ Evaluator/ReductionEvaluator.h 27 Nov 2003 20:57:36 -0000
@@ -108,6 +108,56 @@
};
+/**
+ * Class to hold static array for partial reduction results
+ * and routine for final reduction. Two versions, one dummy
+ * for non-OpenMP, one for OpenMP operation.
+ */
+
+#ifndef _OPENMP
+template<class T>
+struct PartialReduction {
+ static inline void init() {}
+ inline void storePartialResult(const T& result)
+ {
+ answer = result;
+ }
+ template <class Op>
+ inline void reduce(T& ret, const Op&)
+ {
+ ret = answer;
+ }
+ T answer;
+};
+#else
+template<class T>
+struct PartialReduction {
+ static inline void init()
+ {
+ if (!answer)
+ answer = new T[omp_get_max_threads()];
+ }
+ inline void storePartialResult(const T& result)
+ {
+ int n = omp_get_thread_num();
+ answer[n] = result;
+ if (n == 0)
+ num_threads = omp_get_num_threads();
+ }
+ template <class Op>
+ inline void reduce(T& ret, const Op& op)
+ {
+ T res = answer[0];
+ for (int i = 1; i<num_threads; ++i)
+ op(res, answer[i]);
+ ret = res;
+ }
+ int num_threads;
+ static T *answer;
+};
+template <class T>
+T *PartialReduction<T>::answer = NULL;
+#endif
//-----------------------------------------------------------------------------
@@ -130,6 +180,7 @@
template<>
struct ReductionEvaluator<InlineKernelTag>
{
+
//---------------------------------------------------------------------------
// Input an expression and cause it to be evaluated.
// All this template function does is extract the domain
@@ -139,6 +190,7 @@
inline static void evaluate(T &ret, const Op &op, const Expr &e)
{
typedef typename Expr::Domain_t Domain_t;
+ PartialReduction<T>::init();
evaluate(ret, op, e, e.domain(),
WrappedInt<Domain_t::dimensions>());
}
@@ -171,7 +223,7 @@
//
// NOTE: These loops assume that the domain passed in is a unit-stride
// domain starting at 0. Assertions are made to make sure this is true.
-
+
template<class T, class Op, class Expr, class Domain>
inline static void evaluate(T &ret, const Op &op, const Expr &e,
const Domain &domain, WrappedInt<1>)
@@ -181,9 +233,16 @@
Expr localExpr(e);
int e0 = domain[0].length();
- T answer = ReductionTraits<Op, T>::identity();
- for (int i0 = 0; i0 < e0; ++i0)
- op(answer, localExpr.read(i0));
+ PartialReduction<T> reduction;
+#pragma omp parallel if (e0 > 512)
+ {
+ T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+ for (int i0 = 0; i0 < e0; ++i0)
+ op(answer, localExpr.read(i0));
+ reduction.storePartialResult(answer);
+ }
+ reduction.reduce(ret, op);
ret = answer;
}
@@ -199,12 +258,17 @@
int e0 = domain[0].length();
int e1 = domain[1].length();
- T answer = ReductionTraits<Op, T>::identity();
- for (int i1 = 0; i1 < e1; ++i1)
- for (int i0 = 0; i0 < e0; ++i0)
- op(answer, localExpr.read(i0, i1));
-
- ret = answer;
+ PartialReduction<T> reduction;
+#pragma omp parallel
+ {
+ T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+ for (int i1 = 0; i1 < e1; ++i1)
+ for (int i0 = 0; i0 < e0; ++i0)
+ op(answer, localExpr.read(i0, i1));
+ reduction.storePartialResult(answer);
+ }
+ reduction.reduce(ret, op);
}
template<class T, class Op, class Expr, class Domain>
@@ -220,13 +284,18 @@
int e1 = domain[1].length();
int e2 = domain[2].length();
- T answer = ReductionTraits<Op, T>::identity();
- for (int i2 = 0; i2 < e2; ++i2)
- for (int i1 = 0; i1 < e1; ++i1)
- for (int i0 = 0; i0 < e0; ++i0)
- op(answer, localExpr.read(i0, i1, i2));
-
- ret = answer;
+ PartialReduction<T> reduction;
+#pragma omp parallel
+ {
+ T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+ for (int i2 = 0; i2 < e2; ++i2)
+ for (int i1 = 0; i1 < e1; ++i1)
+ for (int i0 = 0; i0 < e0; ++i0)
+ op(answer, localExpr.read(i0, i1, i2));
+ reduction.storePartialResult(answer);
+ }
+ reduction.reduce(ret, op);
}
template<class T, class Op, class Expr, class Domain>
@@ -244,14 +313,19 @@
int e2 = domain[2].length();
int e3 = domain[3].length();
- T answer = ReductionTraits<Op, T>::identity();
- for (int i3 = 0; i3 < e3; ++i3)
- for (int i2 = 0; i2 < e2; ++i2)
- for (int i1 = 0; i1 < e1; ++i1)
- for (int i0 = 0; i0 < e0; ++i0)
- op(answer, localExpr.read(i0, i1, i2, i3));
-
- ret = answer;
+ PartialReduction<T> reduction;
+#pragma omp parallel
+ {
+ T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+ for (int i3 = 0; i3 < e3; ++i3)
+ for (int i2 = 0; i2 < e2; ++i2)
+ for (int i1 = 0; i1 < e1; ++i1)
+ for (int i0 = 0; i0 < e0; ++i0)
+ op(answer, localExpr.read(i0, i1, i2, i3));
+ reduction.storePartialResult(answer);
+ }
+ reduction.reduce(ret, op);
}
template<class T, class Op, class Expr, class Domain>
@@ -271,15 +345,20 @@
int e3 = domain[3].length();
int e4 = domain[4].length();
- T answer = ReductionTraits<Op, T>::identity();
- for (int i4 = 0; i4 < e4; ++i4)
- for (int i3 = 0; i3 < e3; ++i3)
- for (int i2 = 0; i2 < e2; ++i2)
- for (int i1 = 0; i1 < e1; ++i1)
- for (int i0 = 0; i0 < e0; ++i0)
- op(answer, localExpr.read(i0, i1, i2, i3, i4));
-
- ret = answer;
+ PartialReduction<T> reduction;
+#pragma omp parallel
+ {
+ T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+ for (int i4 = 0; i4 < e4; ++i4)
+ for (int i3 = 0; i3 < e3; ++i3)
+ for (int i2 = 0; i2 < e2; ++i2)
+ for (int i1 = 0; i1 < e1; ++i1)
+ for (int i0 = 0; i0 < e0; ++i0)
+ op(answer, localExpr.read(i0, i1, i2, i3, i4));
+ reduction.storePartialResult(answer);
+ }
+ reduction.reduce(ret, op);
}
template<class T, class Op, class Expr, class Domain>
@@ -301,16 +380,21 @@
int e4 = domain[4].length();
int e5 = domain[5].length();
- T answer = ReductionTraits<Op, T>::identity();
- for (int i5 = 0; i5 < e5; ++i5)
- for (int i4 = 0; i4 < e4; ++i4)
- for (int i3 = 0; i3 < e3; ++i3)
- for (int i2 = 0; i2 < e2; ++i2)
- for (int i1 = 0; i1 < e1; ++i1)
- for (int i0 = 0; i0 < e0; ++i0)
- op(answer, localExpr.read(i0, i1, i2, i3, i4, i5));
-
- ret = answer;
+ PartialReduction<T> reduction;
+#pragma omp parallel
+ {
+ T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+ for (int i5 = 0; i5 < e5; ++i5)
+ for (int i4 = 0; i4 < e4; ++i4)
+ for (int i3 = 0; i3 < e3; ++i3)
+ for (int i2 = 0; i2 < e2; ++i2)
+ for (int i1 = 0; i1 < e1; ++i1)
+ for (int i0 = 0; i0 < e0; ++i0)
+ op(answer, localExpr.read(i0, i1, i2, i3, i4, i5));
+ reduction.storePartialResult(answer);
+ }
+ reduction.reduce(ret, op);
}
template<class T, class Op, class Expr, class Domain>
@@ -334,17 +418,22 @@
int e5 = domain[5].length();
int e6 = domain[6].length();
- T answer = ReductionTraits<Op, T>::identity();
- for (int i6 = 0; i6 < e6; ++i6)
- for (int i5 = 0; i5 < e5; ++i5)
- for (int i4 = 0; i4 < e4; ++i4)
- for (int i3 = 0; i3 < e3; ++i3)
- for (int i2 = 0; i2 < e2; ++i2)
- for (int i1 = 0; i1 < e1; ++i1)
- for (int i0 = 0; i0 < e0; ++i0)
- op(answer, localExpr.read(i0, i1, i2, i3, i4, i5, i6));
-
- ret = answer;
+ PartialReduction<T> reduction;
+#pragma omp parallel
+ {
+ T answer = ReductionTraits<Op, T>::identity();
+#pragma omp for nowait
+ for (int i6 = 0; i6 < e6; ++i6)
+ for (int i5 = 0; i5 < e5; ++i5)
+ for (int i4 = 0; i4 < e4; ++i4)
+ for (int i3 = 0; i3 < e3; ++i3)
+ for (int i2 = 0; i2 < e2; ++i2)
+ for (int i1 = 0; i1 < e1; ++i1)
+ for (int i0 = 0; i0 < e0; ++i0)
+ op(answer, localExpr.read(i0, i1, i2, i3, i4, i5, i6));
+ reduction.storePartialResult(answer);
+ }
+ reduction.reduce(ret, op);
}
};
More information about the pooma-dev
mailing list