[pooma-dev] Good News. Intel's ICC 8.0 Beta looks promising, now.

Tue Jun 3 20:41:26 UTC 2003

On Tue, 3 Jun 2003, Paul A. Renard wrote:

> Richard:
>
> From your message:
>       Unfortunately my tests show it's better, but still worse than with gcc.
>       Your test is 1d; try 3d and it starts to suck. Inlining is still the
>       culprit, as is CSE with, for instance, Loc<n> (where n>1) objects.
>
>
> Actually, my test is 2D.  Do you have a 3D test you can send?  Were you comparing
> icc 8.0?  I'd like to try your test on my machine with KCC and icc 8.0

Yes, attached. I tested icc8.0 and gcc3.3 (patched). I'd be interested in
KCC results, too.

With gcc I get

Benchmark size 262144:
  ET:               5.55688e-08
  Stencil:          6.05278e-08
  ScalarCode (int): 7.5695e-08
  ScalarCode (Loc): 1.15906e-07
Benchmark size 2097152:
  ET:               5.74374e-08
  Stencil:          6.38685e-08
  ScalarCode (int): 7.94697e-08
  ScalarCode (Loc): 1.19308e-07
Benchmark size 262144:
  ET:               7.75644e-08
  Stencil:          7.78923e-08
  ScalarCode (int): 6.76191e-08
  ScalarCode (Loc): 1.55674e-07
Benchmark size 2097152:
  ET:               6.99201e-08
  Stencil:          7.7395e-08
  ScalarCode (int): 6.24175e-08
  ScalarCode (Loc): 1.54993e-07
Total (sum) s/iteration 1.37126e-06

with icc

Benchmark size 262144:
  ET:               7.37382e-08
  Stencil:          7.42148e-08
  ScalarCode (int): 8.37249e-08
  ScalarCode (Loc): 9.26857e-08
Benchmark size 2097152:
  ET:               8.0122e-08
  Stencil:          7.84069e-08
  ScalarCode (int): 8.49171e-08
  ScalarCode (Loc): 9.70053e-08
Benchmark size 262144:
  ET:               1.14643e-07
  Stencil:          9.76029e-08
  ScalarCode (int): 6.61776e-08
  ScalarCode (Loc): 1.42822e-07
Benchmark size 2097152:
  ET:               1.13272e-07
  Stencil:          9.84888e-08
  ScalarCode (int): 5.80321e-08
  ScalarCode (Loc): 1.41148e-07
Total (sum) s/iteration 1.497e-06

While the 1d ScalarCode versions using Loc are faster with icc, the 3d
expression template versions are awfully slow (I have already filed a PR).

Richard.

-------------- next part --------------
#include "Pooma/Pooma.h"
#include "Pooma/Arrays.h"
#include "Utilities/Clock.h"

template <class A1, class A2>
void benchET(const A1& a, const A2& b)
{
	asm("benchET_begin:");
	Interval<1> I = a.physicalDomain();
	Loc<1> dX = Loc<1>(1);

	b(I) = 0.5 * (a.read(I-dX) + a.read(I+dX));
	asm("benchET_end:");
}

// Stencil functor handed to Stencil<> in benchStencil().
//
// operator() returns the average of the two neighbours of index `pos`,
// reading them through the array's read() accessor. lowerExtent() and
// upperExtent() both report 1 for every dimension argument, matching the
// one-element reach of the kernel on each side.
struct MyStencil {
	MyStencil() {}

	template <class A1>
	inline typename A1::Element_t operator()(const A1& arr, int pos) const
	{
		return 0.5 * (arr.read(pos-1) + arr.read(pos+1));
	}

	// The dimension argument is ignored: the reach is 1 on both sides.
	inline int lowerExtent(int) const { return 1; }
	inline int upperExtent(int) const { return 1; }
};

template <class A1, class A2>
void benchStencil(const A1& a, const A2& b)
{
	asm("benchStencil_begin:");
	Interval<1> I = a.physicalDomain();

	b(I) = Stencil<MyStencil>()(a)(I);
	asm("benchStencil_end:");
}

struct MyScalarCodeLoc {
	MyScalarCodeLoc() {};
	void scalarCodeInfo(ScalarCodeInfo<1, 2>& i) const
	{
		i.extent(GuardLayers<1>(1));
		i.write(0, true);
		i.write(1, false);
		i.useGuards(0, false);
		i.useGuards(1, true);
	}
	static const Loc<1> dX;
	template <class A1, class A2>
	inline void operator()(const A1& a, const A2& b, const Loc<1>& I) const
	{
		b(I) = 0.5 * (a.read(I-dX) + a.read(I+dX));
	}
};
const Loc<1> MyScalarCodeLoc::dX = Loc<1>(1);

struct MyScalarCodeInt {
	MyScalarCodeInt() {};
	void scalarCodeInfo(ScalarCodeInfo<1, 2>& i) const
	{
		i.extent(GuardLayers<1>(1));
		i.write(0, true);
		i.write(1, false);
		i.useGuards(0, false);
		i.useGuards(1, true);
	}
	template <class A1, class A2>
	inline void operator()(const A1& a, const A2& b, const Loc<1>& I) const
	{
		int i = I.first();
		b(i) = 0.5 * (a.read(i-1) + a.read(i+1));
	}
};

// ScalarCode variant of the benchmark kernel using Loc<1> indexing.
//
// Applies a ScalarCode<MyScalarCodeLoc> object to (a, b). No domain is
// passed to it, so the previously computed physical-domain local was unused
// and has been removed from the measured region.
// The asm statements emit the labels benchScalarCodeLoc_begin /
// benchScalarCodeLoc_end around the call (presumably to locate the kernel in
// the generated assembly).
template <class A1, class A2>
void benchScalarCodeLoc(const A1& a, const A2& b)
{
	asm("benchScalarCodeLoc_begin:");
	ScalarCode<MyScalarCodeLoc>()(a, b);
	asm("benchScalarCodeLoc_end:");
}

// ScalarCode variant of the benchmark kernel using int indexing.
//
// Applies a ScalarCode<MyScalarCodeInt> object to (a, b). No domain is
// passed to it, so the previously computed physical-domain local was unused
// and has been removed from the measured region.
// The asm statements emit the labels benchScalarCodeInt_begin /
// benchScalarCodeInt_end around the call (presumably to locate the kernel in
// the generated assembly).
template <class A1, class A2>
void benchScalarCodeInt(const A1& a, const A2& b)
{
	asm("benchScalarCodeInt_begin:");
	ScalarCode<MyScalarCodeInt>()(a, b);
	asm("benchScalarCodeInt_end:");
}

void bench(int size)
{
	Interval<1> domain = Interval<1>(size);
	GridLayout<1> layout = GridLayout<1>(domain, Loc<1>(8), GuardLayers<1>(1), ReplicatedTag());
	Array<1, double, MultiPatch<GridTag, Brick> > A(layout), B(layout);
	A(A.domain()) = 1.0;
	B(domain) = 1.0;
	if (!all(B(domain) == 1.0))
		exit(1);

	double startET = Pooma::Clock::value();
	benchET(A, B);
	double endET = Pooma::Clock::value();
	if (!all(B(domain) == 1.0))
		exit(1);

	double startStencil = Pooma::Clock::value();
	benchStencil(A, B);
	double endStencil = Pooma::Clock::value();
	if (!all(B(domain) == 1.0))
		exit(1);

	double startScalarCodeInt = Pooma::Clock::value();
	benchScalarCodeInt(A, B);
	double endScalarCodeInt = Pooma::Clock::value();
	if (!all(B(domain) == 1.0))
		exit(1);

	double startScalarCodeLoc = Pooma::Clock::value();
	benchScalarCodeLoc(A, B);
	double endScalarCodeLoc = Pooma::Clock::value();
	if (!all(B(domain) == 1.0))
		exit(1);

	Inform out;
	out << "Benchmark size " << size << ":" << std::endl;
	out << "  ET:               "
	    << (endET - startET)/size << std::endl;
	out << "  Stencil:          "
	    << (endStencil - startStencil)/size << std::endl;
	out << "  ScalarCode (int): "
	    << (endScalarCodeInt - startScalarCodeInt)/size << std::endl;
	out << "  ScalarCode (Loc): "
	    << (endScalarCodeLoc - startScalarCodeLoc)/size << std::endl;
}

// Program entry: initialises POOMA, turns on blocking expressions, runs the
// benchmark at 32^3 elements scaled by 1, 10 and 100, then shuts POOMA down.
int main(int argc, char **argv)
{
	Pooma::initialize(argc, argv);
	Pooma::blockingExpressions(true);

	const int base = 32*32*32;
	const int factors[] = { 1, 10, 100 };
	const unsigned count = sizeof(factors) / sizeof(factors[0]);
	for (unsigned k = 0; k < count; ++k)
		bench(base * factors[k]);

	Pooma::finalize();
	return 0;
}