[pooma-dev] Good News. Intel's ICC 8.0 Beta looks promising, now.
Richard Guenther
rguenth at tat.physik.uni-tuebingen.de
Tue Jun 3 20:41:26 UTC 2003
On Tue, 3 Jun 2003, Paul A. Renard wrote:
> Richard:
>
> From your message:
> Unfortunately my tests show it's better, but still worse than with gcc.
> Your test is 1d; try 3d and it starts to suck. Inlining is still the
> culprit, as is CSE with, e.g., Loc<n> (where n>1) objects.
>
>
> Actually, my test is 2D. Do you have a 3D test you can send? Were you comparing
> icc 8.0? I'd like to try your test on my machine with KCC and icc 8.0.
Yes, attached. I tested icc8.0 and gcc3.3 (patched). I'd be interested in
KCC results, too.
With gcc I get
Benchmark size 262144:
ET: 5.55688e-08
Stencil: 6.05278e-08
ScalarCode (int): 7.5695e-08
ScalarCode (Loc): 1.15906e-07
Benchmark size 2097152:
ET: 5.74374e-08
Stencil: 6.38685e-08
ScalarCode (int): 7.94697e-08
ScalarCode (Loc): 1.19308e-07
Benchmark size 262144:
ET: 7.75644e-08
Stencil: 7.78923e-08
ScalarCode (int): 6.76191e-08
ScalarCode (Loc): 1.55674e-07
Benchmark size 2097152:
ET: 6.99201e-08
Stencil: 7.7395e-08
ScalarCode (int): 6.24175e-08
ScalarCode (Loc): 1.54993e-07
Total (sum) s/iteration 1.37126e-06
with icc
Benchmark size 262144:
ET: 7.37382e-08
Stencil: 7.42148e-08
ScalarCode (int): 8.37249e-08
ScalarCode (Loc): 9.26857e-08
Benchmark size 2097152:
ET: 8.0122e-08
Stencil: 7.84069e-08
ScalarCode (int): 8.49171e-08
ScalarCode (Loc): 9.70053e-08
Benchmark size 262144:
ET: 1.14643e-07
Stencil: 9.76029e-08
ScalarCode (int): 6.61776e-08
ScalarCode (Loc): 1.42822e-07
Benchmark size 2097152:
ET: 1.13272e-07
Stencil: 9.84888e-08
ScalarCode (int): 5.80321e-08
ScalarCode (Loc): 1.41148e-07
Total (sum) s/iteration 1.497e-06
While the 1d ScalarCode versions using Loc are better with icc, the 3d
expression template versions are awfully slow (I have already filed a PR).
Richard.
-------------- next part --------------
#include "Pooma/Pooma.h"
#include "Pooma/Arrays.h"
#include "Utilities/Clock.h"
template <class A1, class A2>
void benchET(const A1& a, const A2& b)
{
asm("benchET_begin:");
Interval<1> I = a.physicalDomain();
Loc<1> dX = Loc<1>(1);
b(I) = 0.5 * (a.read(I-dX) + a.read(I+dX));
asm("benchET_end:");
}
// Functor handed to Stencil<> in benchStencil: for a point i it returns half
// the sum of the elements at i-1 and i+1.
struct MyStencil {
  MyStencil() {}

  // Both extents are 1, matching the i-1 / i+1 reads in operator().
  // The int argument is ignored (presumably the dimension index -- confirm
  // against the Stencil interface).
  inline int lowerExtent(int) const { return 1; }
  inline int upperExtent(int) const { return 1; }

  // a: any type providing Element_t and read(int); i: evaluation point.
  template <class A1>
  inline typename A1::Element_t operator()(const A1& a, int i) const
  {
    const int below = i - 1;
    const int above = i + 1;
    return 0.5 * (a.read(below) + a.read(above));
  }
};
template <class A1, class A2>
void benchStencil(const A1& a, const A2& b)
{
asm("benchStencil_begin:");
Interval<1> I = a.physicalDomain();
b(I) = Stencil<MyStencil>()(a)(I);
asm("benchStencil_end:");
}
// Functor handed to ScalarCode<> in benchScalarCodeLoc. It computes the same
// neighbour average as the other kernels, but indexes the arrays with
// Loc<1> objects rather than plain ints.
struct MyScalarCodeLoc {
  // Unit offset used to reach the left/right neighbour; defined below.
  static const Loc<1> dX;

  MyScalarCodeLoc() {}

  // Describes the two functor arguments to ScalarCode.
  //
  // NOTE(review): operator() below writes its second array argument (b,
  // index 1) and reads neighbours of its first (a, index 0), yet index 0 is
  // the one flagged write=true / useGuards=false here. If write(n, ...) and
  // useGuards(n, ...) refer to the n-th operator() argument, these flags
  // look swapped -- confirm against the ScalarCodeInfo documentation.
  void scalarCodeInfo(ScalarCodeInfo<1, 2>& i) const
  {
    i.extent(GuardLayers<1>(1));

    i.write(0, true);
    i.write(1, false);

    i.useGuards(0, false);
    i.useGuards(1, true);
  }

  // Point kernel: b(I) becomes half the sum of a at I-dX and I+dX.
  template <class A1, class A2>
  inline void operator()(const A1& a, const A2& b, const Loc<1>& I) const
  {
    b(I) = 0.5 * (a.read(I - dX) + a.read(I + dX));
  }
};

const Loc<1> MyScalarCodeLoc::dX(1);
// Functor handed to ScalarCode<> in benchScalarCodeInt. It computes the same
// neighbour average as MyScalarCodeLoc, but converts the Loc<1> to an int
// once and indexes the arrays with plain integers.
struct MyScalarCodeInt {
  MyScalarCodeInt() {}

  // Describes the two functor arguments to ScalarCode.
  //
  // NOTE(review): operator() below writes its second array argument (b,
  // index 1) and reads neighbours of its first (a, index 0), yet index 0 is
  // the one flagged write=true / useGuards=false here. If write(n, ...) and
  // useGuards(n, ...) refer to the n-th operator() argument, these flags
  // look swapped -- confirm against the ScalarCodeInfo documentation.
  void scalarCodeInfo(ScalarCodeInfo<1, 2>& i) const
  {
    i.extent(GuardLayers<1>(1));

    i.write(0, true);
    i.write(1, false);

    i.useGuards(0, false);
    i.useGuards(1, true);
  }

  // Point kernel: b(i) becomes half the sum of a at i-1 and i+1, where i is
  // I.first().
  template <class A1, class A2>
  inline void operator()(const A1& a, const A2& b, const Loc<1>& I) const
  {
    const int centre = I.first();
    b(centre) = 0.5 * (a.read(centre - 1) + a.read(centre + 1));
  }
};
// ScalarCode variant of the benchmark kernel using Loc<1> indexing.
//
// Invokes a default-constructed ScalarCode<MyScalarCodeLoc> on (a, b); the
// per-point work is MyScalarCodeLoc::operator(). No explicit domain is
// passed: the previous version fetched a.physicalDomain() into a local that
// was never used, and that dead local has been removed.
//
// The asm() statements place the labels benchScalarCodeLoc_begin /
// benchScalarCodeLoc_end in the emitted assembly, presumably as markers for
// inspecting the generated code.
template <class A1, class A2>
void benchScalarCodeLoc(const A1& a, const A2& b)
{
  asm("benchScalarCodeLoc_begin:");
  ScalarCode<MyScalarCodeLoc>()(a, b);
  asm("benchScalarCodeLoc_end:");
}
// ScalarCode variant of the benchmark kernel using plain int indexing.
//
// Invokes a default-constructed ScalarCode<MyScalarCodeInt> on (a, b); the
// per-point work is MyScalarCodeInt::operator(). No explicit domain is
// passed: the previous version fetched a.physicalDomain() into a local that
// was never used, and that dead local has been removed.
//
// The asm() statements place the labels benchScalarCodeInt_begin /
// benchScalarCodeInt_end in the emitted assembly, presumably as markers for
// inspecting the generated code.
template <class A1, class A2>
void benchScalarCodeInt(const A1& a, const A2& b)
{
  asm("benchScalarCodeInt_begin:");
  ScalarCode<MyScalarCodeInt>()(a, b);
  asm("benchScalarCodeInt_end:");
}
void bench(int size)
{
Interval<1> domain = Interval<1>(size);
GridLayout<1> layout = GridLayout<1>(domain, Loc<1>(8), GuardLayers<1>(1), ReplicatedTag());
Array<1, double, MultiPatch<GridTag, Brick> > A(layout), B(layout);
A(A.domain()) = 1.0;
B(domain) = 1.0;
if (!all(B(domain) == 1.0))
exit(1);
double startET = Pooma::Clock::value();
benchET(A, B);
double endET = Pooma::Clock::value();
if (!all(B(domain) == 1.0))
exit(1);
double startStencil = Pooma::Clock::value();
benchStencil(A, B);
double endStencil = Pooma::Clock::value();
if (!all(B(domain) == 1.0))
exit(1);
double startScalarCodeInt = Pooma::Clock::value();
benchScalarCodeInt(A, B);
double endScalarCodeInt = Pooma::Clock::value();
if (!all(B(domain) == 1.0))
exit(1);
double startScalarCodeLoc = Pooma::Clock::value();
benchScalarCodeLoc(A, B);
double endScalarCodeLoc = Pooma::Clock::value();
if (!all(B(domain) == 1.0))
exit(1);
Inform out;
out << "Benchmark size " << size << ":" << std::endl;
out << " ET: "
<< (endET - startET)/size << std::endl;
out << " Stencil: "
<< (endStencil - startStencil)/size << std::endl;
out << " ScalarCode (int): "
<< (endScalarCodeInt - startScalarCodeInt)/size << std::endl;
out << " ScalarCode (Loc): "
<< (endScalarCodeLoc - startScalarCodeLoc)/size << std::endl;
}
// Program entry point: initialises POOMA, turns on blocking expressions, runs
// the benchmark at 32^3, 10 * 32^3 and 100 * 32^3 elements, then shuts POOMA
// down.
int main(int argc, char **argv)
{
  Pooma::initialize(argc, argv);
  Pooma::blockingExpressions(true);

  const int baseSize = 32 * 32 * 32;
  const int scale[] = { 1, 10, 100 };
  const int runs = sizeof(scale) / sizeof(scale[0]);
  for (int r = 0; r < runs; ++r) {
    bench(baseSize * scale[r]);
  }

  Pooma::finalize();
  return 0;
}
More information about the pooma-dev
mailing list