[PATCH] Fix deadlocks in MPI reduction evaluators
Richard Guenther
rguenth at tat.physik.uni-tuebingen.de
Thu Jan 8 22:13:52 UTC 2004
Hi!
The following patch is necessary to avoid deadlocks with the MPI
implementation in multi-patch setups where one context does not
participate in the reduction.
It fixes a failure of one of the array_test_.. tests (I don't remember
which) with MPI.
Basically the scenario is that the collective synchronous MPI_Gather is
called from ReduceOverContexts<> on the non-participating (and thus
not receiving) contexts while the SendIterates are still in the
scheduler's queue. The contexts participating in the calculation then
wait forever on the CSem for the ReceiveIterates and patch reductions
to complete.
So the fix is to make the non-participating contexts wait on the CSem,
too, by queueing a fake write iterate after the send iterates; it
triggers (and raises the CSem) as soon as the send iterates complete.
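
To illustrate the idea outside of POOMA, here is a minimal standalone
sketch using plain C++ threads of such a "completion marker" task that
raises a counting semaphore only after the previously queued work has
run. The CountingSemaphore class, the work queue and the lambdas are
illustrative stand-ins, not the actual POOMA classes; in the patch the
ordering is enforced by taking a DataObjectRequest<WriteRequest> on the
expression, whereas the sketch simply relies on FIFO queue order.

#include <condition_variable>
#include <deque>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>

// Stand-in for Pooma::CountingSemaphore: wait() blocks until incr()
// has been called 'height' times.
class CountingSemaphore {
public:
  explicit CountingSemaphore(int height) : count_m(0), height_m(height) {}
  void incr() {
    std::lock_guard<std::mutex> lock(mutex_m);
    ++count_m;
    cond_m.notify_all();
  }
  void wait() {
    std::unique_lock<std::mutex> lock(mutex_m);
    cond_m.wait(lock, [this] { return count_m >= height_m; });
  }
private:
  int count_m, height_m;
  std::mutex mutex_m;
  std::condition_variable cond_m;
};

int main() {
  CountingSemaphore csem(1);
  std::deque<std::function<void()>> queue;

  // A send iterate that is still sitting in the scheduler's queue...
  queue.push_back([] { std::cout << "send iterate: ship patch data\n"; });

  // ...followed by the fake write iterate, which does nothing but
  // raise the semaphore once everything queued before it has run.
  queue.push_back([&csem] { csem.incr(); });

  // A worker thread drains the queue in order, like the scheduler would.
  std::thread worker([&queue] {
    for (std::function<void()> &task : queue) task();
  });

  // The non-participating context now blocks here instead of entering
  // the collective gather while its sends are still queued.
  csem.wait();
  std::cout << "safe to enter the collective gather now\n";

  worker.join();
  return 0;
}

The only point of the sketch is the ordering: the context that does not
receive anything still waits on the CSem, and the CSem is only raised
after its sends have been processed.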
Tested using MPI, Cheetah and serial some time ago.
Ok?
Richard.
2004Jan08 Richard Guenther <richard.guenther at uni-tuebingen.de>
* src/Engine/RemoteEngine.h: use a waiting iterate to wait for
reduction completion in remote single and multi-patch reduction
evaluators.
Do begin/endGeneration at the toplevel evaluate.
* src/Evaluator/Reduction.h: do begin/endGeneration at the toplevel
evaluate.
--- src/Engine/RemoteEngine.h 2004-01-02 12:57:48.000000000 +0100
+++ /home/richard/src/pooma/pooma-mpi3/r2/src/Engine/RemoteEngine.h 2004-01-08 23:00:40.000000000 +0100
@@ -1954,6 +1962,29 @@
}
};
+
+template <class Expr>
+struct WaitingIterate : public Pooma::Iterate_t {
+ WaitingIterate(const Expr& e, Pooma::CountingSemaphore& csem)
+ : Pooma::Iterate_t(Pooma::scheduler()),
+ e_m(e), csem_m(csem)
+ {
+ DataObjectRequest<WriteRequest> writeReq(*this);
+ engineFunctor(e_m, writeReq);
+ }
+ virtual void run()
+ {
+ csem_m.incr();
+ }
+ virtual ~WaitingIterate()
+ {
+ DataObjectRequest<WriteRelease> writeRel;
+ engineFunctor(e_m, writeRel);
+ }
+ Expr e_m;
+ Pooma::CountingSemaphore& csem_m;
+};
+
//-----------------------------------------------------------------------------
// Single-patch Reductions involving remote engines:
//
@@ -1998,12 +2029,11 @@
Pooma::CountingSemaphore csem;
csem.height(1);
- Pooma::scheduler().beginGeneration();
-
if (Pooma::context() != computationContext)
{
expressionApply(e, RemoteSend(computationContext));
- csem.incr();
+ Pooma::Iterate_t *it = new WaitingIterate<Expr>(e, csem);
+ Pooma::scheduler().handOff(it);
}
else
{
@@ -2013,8 +2043,7 @@
forEach(e, view, TreeCombine()), csem);
}
- Pooma::scheduler().endGeneration();
-
+ // Wait for RemoteSend or Reduction to complete.
csem.wait();
RemoteProxy<T> globalRet(ret, computationContext);
@@ -2102,8 +2131,6 @@
csem.height(n);
T *vals = new T[n];
- Pooma::scheduler().beginGeneration();
-
i = inter.begin();
k = 0;
for (j = 0; j < inter.size(); j++)
@@ -2129,13 +2156,19 @@
else
{
expressionApply(e(*i), RemoteSend(computationalContext[j]));
+ // One extra RemoteSend to wait for. Maybe we can combine these
+ // iterates, but maybe not. Play safe for now.
+ csem.raise_height(1);
+ Pooma::Iterate_t *it = new WaitingIterate
+ <typename View1<Expr, INode<Expr::dimensions> >::Type_t>(e(*i), csem);
+ Pooma::scheduler().handOff(it);
}
}
++i;
}
- Pooma::scheduler().endGeneration();
+ // Wait for RemoteSends and Reductions to complete.
csem.wait();
if (n > 0)
--- src/Evaluator/Reduction.h 2003-11-21 22:30:38.000000000 +0100
+++ /home/richard/src/pooma/pooma-mpi3/r2/src/Evaluator/Reduction.h 2004-01-02 00:40:14.000000000 +0100
@@ -128,10 +128,15 @@
void evaluate(T &ret, const Op &op, const Expr &e) const
{
typedef typename EvaluatorTag1<Expr>::Evaluator_t Evaluator_t;
+
+ Pooma::scheduler().beginGeneration();
+
PAssert(checkValidity(e, WrappedInt<Expr::hasRelations>()));
forEach(e, PerformUpdateTag(), NullCombine());
Reduction<Evaluator_t>().evaluate(ret, op, e());
+ Pooma::scheduler().endGeneration();
+
POOMA_INCREMENT_STATISTIC(NumReductions)
}
};
@@ -184,12 +189,8 @@
Pooma::CountingSemaphore csem;
csem.height(1);
- Pooma::scheduler().beginGeneration();
-
evaluate(ret, op, e, csem);
- Pooma::scheduler().endGeneration();
-
csem.wait();
}
};
@@ -237,12 +238,10 @@
expressionApply(e, IntersectorTag<Inter_t>(inter));
- const int n = std::distance(inter.begin(), inter.end());
+ const int n = inter.size();
Pooma::CountingSemaphore csem;
csem.height(n);
T *vals = new T[n];
-
- Pooma::scheduler().beginGeneration();
typename Inter_t::const_iterator i = inter.begin();
int j = 0;
@@ -253,8 +252,6 @@
++i; ++j;
}
- Pooma::scheduler().endGeneration();
-
csem.wait();
ret = vals[0];