[PATCH] fix IPP 2D FFT, complete FFT tests

Thu Sep 29 02:12:58 UTC 2005

I have checked in the patch below. 

It adds (nearly) exhaustive testing on Fft features, and fixes 
failures in IPP FFT support the testing reveals.  It also adds 
tests for real->complex and complex -> real Fftm.

Don't be surprised when fft.cpp takes one or two minutes to compile, 
now, and spends most of that time producing 40MB of assembly code.

Nathan Myers
ncm

Index: ChangeLog
===================================================================
RCS file: /home/cvs/Repository/vpp/ChangeLog,v
retrieving revision 1.280
retrieving revision 1.281
diff -u -p -r1.280 -r1.281

--- ChangeLog	28 Sep 2005 19:07:26 -0000	1.280
+++ ChangeLog	29 Sep 2005 02:01:09 -0000	1.281
@@ -1,3 +1,17 @@
+2005-09-28  Nathan Myers  <ncm at codesourcery.com>
+
+	* src/vsip/fft-core.hpp: Make IPP FFT work for 2D FFT. 
+	  Make unimplemented IPP driver functions report failure.
+	* src/vsip/signal-fft.hpp: Initialize scale member early enough 
+	  for IPP create_plan use.
+	* tests/fftm.cpp: Enable tests for complex->real, real->complex.
+	* tests/fft.cpp: Add comprehensive testing:
+	   (2D, 3D) x ((cx->cx fwd, inv), ((re->cx, cx->re) x (all axes))) 
+	   x (Dense/row-major, Dense/column-major, Fast_block)
+	   x (single,double) x (in-place, by_reference, by_value) 
+	   x (unscaled, arbitrary-scaled, scaled by N)
+	Tested with gcc-3.4/em64t/IPP and gcc-4.0.1/x86/FFTW3.
+
 2005-09-28  Jules Bergmann  <jules at codesourcery.com>
 
 	* src/vsip/impl/block-traits.hpp (View_block_storage):
Index: src/vsip/impl/fft-core.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/fft-core.hpp,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -p -r1.17 -r1.18
--- src/vsip/impl/fft-core.hpp	28 Sep 2005 00:34:11 -0000	1.17
+++ src/vsip/impl/fft-core.hpp	29 Sep 2005 02:01:09 -0000	1.18
@@ -997,17 +997,19 @@ struct Ipp_DFT_base
   }
 
   static void
-  forward2(void* plan, void const* in, void* out, void* buffer, bool f)
+  forward2(
+    void* plan, void const* in, unsigned in_row_step, 
+    void* out, unsigned out_row_step, void* buffer, bool f)
     VSIP_NOTHROW
   {
     IppStatus result = (f ?
       (*forwardFFun2)(
-	reinterpret_cast<T const*>(in), sizeof(T),
-	reinterpret_cast<T*>(out), sizeof(T),
+	reinterpret_cast<T const*>(in), in_row_step,
+	reinterpret_cast<T*>(out), out_row_step,
 	reinterpret_cast<planFT*>(plan), reinterpret_cast<Ipp8u*>(buffer)) :
       (*forwardDFun2)(
-	reinterpret_cast<T const*>(in), sizeof(T),
-	reinterpret_cast<T*>(out), sizeof(T),
+	reinterpret_cast<T const*>(in), in_row_step,
+	reinterpret_cast<T*>(out), out_row_step,
 	reinterpret_cast<planDT*>(plan), reinterpret_cast<Ipp8u*>(buffer)));
     assert(result == ippStsNoErr);
   }
@@ -1029,17 +1031,19 @@ struct Ipp_DFT_base
   }
 
   static void
-  inverse2(void* plan, void const* in, void* out, void* buffer, bool f)
+  inverse2(
+    void* plan, void const* in, unsigned in_row_step,
+    void* out, unsigned out_row_step, void* buffer, bool f)
     VSIP_NOTHROW
   {
     IppStatus result = (f ?
       (*inverseFFun2)(
-	reinterpret_cast<T const*>(in), sizeof(T),
-	reinterpret_cast<T*>(out), sizeof(T),
+	reinterpret_cast<T const*>(in), in_row_step,
+        reinterpret_cast<T*>(out), out_row_step,
 	reinterpret_cast<planFT*>(plan), reinterpret_cast<Ipp8u*>(buffer)) :
       (*inverseDFun2)(
-	reinterpret_cast<T const*>(in), sizeof(T),
-	reinterpret_cast<T*>(out), sizeof(T),
+	reinterpret_cast<T const*>(in), in_row_step,
+	reinterpret_cast<T*>(out), out_row_step,
 	reinterpret_cast<planDT*>(plan), reinterpret_cast<Ipp8u*>(buffer)));
     assert(result == ippStsNoErr);
   }
@@ -1049,21 +1053,21 @@ struct Ipp_DFT_base
 // template Ipp_DFT_base<>.
 
 template <typename P> inline IppStatus dum(P**, int, int, IppHintAlgorithm)
-  { return ippStsNoErr; }
+  { return ippStsErr; }
 template <typename P> inline IppStatus dum(P**, int, int, int, IppHintAlgorithm)
-  { return ippStsNoErr; }
+  { return ippStsErr; }
 template <typename P> inline IppStatus dum(P**, IppiSize, int, IppHintAlgorithm)
-  { return ippStsNoErr; }
+  { return ippStsErr; }
 template <typename P> inline IppStatus dum(P*)
-  { return ippStsNoErr; }
+  { return ippStsErr; }
 template <typename P> inline IppStatus dum(P const*, int*)
-  { return ippStsNoErr; }
+  { return ippStsErr; }
 template <typename P, typename T> inline IppStatus dum(
   T const*, T*, P const*, Ipp8u*)
-  { return ippStsNoErr; }
+  { return ippStsErr; }
 template <typename P, typename T> inline IppStatus dum(
   T const*, int, T*, int, P const*, Ipp8u*)
-  { return ippStsNoErr; }
+  { return ippStsErr; }
 
 
 // Specializations of Ipp_DFT create the mappings from argument
@@ -1255,10 +1259,15 @@ create_ipp_plan(
   typedef typename Time_domain<inT,outT>::type time_domain_type;
   typedef Ipp_DFT< (Dim-doFFTM),time_domain_type>  fft_type;
 
-  self.plan_from_to_ = ((Dim - doFFTM == 1) ?
-     fft_type::create_plan(sizex, flags, self.use_fft_) :
-     fft_type::create_plan2(sizex, sizey, flags, self.use_fft_));
-
+  if (Dim - doFFTM == 1)
+    self.plan_from_to_ = fft_type::create_plan(sizex, flags, self.use_fft_);
+  else
+  {
+    self.plan_from_to_ = 
+      fft_type::create_plan2(sizex, sizey, flags, self.use_fft_);
+    self.row_step_ = sizeof(outT) * dom[0].size();
+  }
+  
   self.p_buffer_ = impl::alloc_align(
     16, fft_type::bufsize(self.plan_from_to_, self.use_fft_));
   if (self.p_buffer_ == 0)
@@ -1373,11 +1382,11 @@ from_to(
 // IPP doesn't implement 2D double FFT.  Spec allows that.
 #if ! defined(VSIP_IMPL_DOUBLE)
   if (self.is_forward_)
-    Ipp_DFT<2,std::complex<SCALAR_TYPE> >::forward(
-      self.plan_from_to_, in, out, self.p_buffer_, self.use_fft_) ;
+    Ipp_DFT<2,std::complex<SCALAR_TYPE> >::forward2(self.plan_from_to_,
+      in, self.row_step_, out, self.row_step_, self.p_buffer_, self.use_fft_) ;
   else
-    Ipp_DFT<2,std::complex<SCALAR_TYPE> >::inverse(
-      self.plan_from_to_, in, out, self.p_buffer_, self.use_fft_);
+    Ipp_DFT<2,std::complex<SCALAR_TYPE> >::inverse2(self.plan_from_to_,
+      in, self.row_step_, out, self.row_step_, self.p_buffer_, self.use_fft_);
 
   if (self.doing_scaling_)
     self.scale_ = 1.0;
@@ -1421,8 +1430,8 @@ from_to(
   VSIP_IMPL_THROW(impl::unimplemented(
 		    "IPP FFT-2D real->complex not implemented"));
 #if 0  
-  Ipp_DFT<1,SCALAR_TYPE>::forward2(
-    self.plan_from_to_, in, out, self.p_buffer_, self.use_fft_) ;
+  Ipp_DFT<1,SCALAR_TYPE>::forward2(self.plan_from_to_,
+    in, self.row_step_, out, self.row_step_, self.p_buffer_, self.use_fft_) ;
   // unpack in place
   if (self.doing_scaling_)
     self.scale_ = 1.0;
@@ -1463,8 +1472,8 @@ from_to(
 #if 0  
   // pack in place; maybe this must happen in
   //   fft_by_ref, where _in_, just copied into, is writeable.
-  Ipp_DFT<1,SCALAR_TYPE>::inverse2(
-    self.plan_from_to_, in, out, self.p_buffer_, self.use_fft_) ;
+  Ipp_DFT<1,SCALAR_TYPE>::inverse2(self.plan_from_to_,
+    in, self.row_step_, out, self.row_step_, self.p_buffer_, self.use_fft_) ;
   if (self.doing_scaling_)
     self.scale_ = 1.0;
 #endif
Index: src/vsip/impl/signal-fft.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/signal-fft.hpp,v
retrieving revision 1.27
retrieving revision 1.28
diff -u -p -r1.27 -r1.28
--- src/vsip/impl/signal-fft.hpp	28 Sep 2005 00:34:11 -0000	1.27
+++ src/vsip/impl/signal-fft.hpp	29 Sep 2005 02:01:10 -0000	1.28
@@ -80,6 +80,7 @@ struct Fft_core : impl::Ref_count<impl::
   bool doing_scaling_;  // scaling is performed in the driver.
   bool is_forward_;
   void* p_buffer_;      // temporary storage not allocated in the plan
+  unsigned row_step_;    // length in bytes of 2D row.
 # endif
 
 #endif
@@ -328,6 +329,7 @@ protected:
     , in_temp_(this->input_size_)
     , out_temp_(this->output_size_)
     {
+      core_->scale_ = scale;  // IPP needs this.
       impl::Ext_data<in_block_type>  raw_in(this->in_temp_);
       impl::Ext_data<out_block_type>  raw_out(this->out_temp_);
       this->core_->create_plan(
Index: tests/fftm.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/tests/fftm.cpp,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -p -r1.7 -r1.8
--- tests/fftm.cpp	28 Sep 2005 04:32:55 -0000	1.7
+++ tests/fftm.cpp	29 Sep 2005 02:01:10 -0000	1.8
@@ -107,6 +107,32 @@ void dft_y(
 }
 
 
+template <typename T,
+	  typename Block1,
+	  typename Block2>
+void dft_y_real(
+  vsip::Matrix<T, Block1> in,
+  vsip::Matrix<vsip::complex<T>, Block2> out)
+{
+  length_type const xsize = in.size(1);
+  length_type const ysize = in.size(0);
+  assert(in.size(0)/2 + 1 == out.size(0));
+  assert(in.size(1) == out.size(1));
+  typedef long double AccT;
+
+  AccT const phi = -2.0 * M_PI/ysize;
+
+  for (index_type v=0; v < xsize; ++v)
+    for (index_type w=0; w < ysize / 2 + 1; ++w)
+    {
+      vsip::complex<AccT> sum = vsip::complex<AccT>();
+      for (index_type k=0; k<ysize; ++k)
+	sum += vsip::complex<AccT>(in(k,v)) * sin_cos<AccT>(phi*k*w);
+      out(w,v) = vsip::complex<T>(sum);
+    }
+}
+
+
 // Error metric between two vectors.
 
 template <typename T1,
@@ -412,64 +438,47 @@ test_by_val_y(length_type N)
 }
 
 
-#if 0
 
 /// Test r->c and c->r by-value Fft.
 
 template <typename T>
 void
-test_real(const int set, const length_type N)
+test_real(const length_type N)
 {
-  typedef Fftm<T, std::complex<T>, col, 0, by_value, 1, alg_space>
+  typedef Fftm<T, std::complex<T>, col, fft_fwd, by_value, 1, alg_space>
 	f_fftm_type;
-  typedef Fftm<std::complex<T>, T, col, 0, by_value, 1, alg_space>
+  typedef Fftm<std::complex<T>, T, col, fft_inv, by_value, 1, alg_space>
 	i_fftm_type;
   const length_type N2 = N/2 + 1;
 
-  f_fftm_type f_fftm(Domain<1>(N), 1.0);
-  i_fftm_type i_fftm(Domain<1>(N), 1.0/(N));
+  f_fftm_type f_fftm(Domain<2>(Domain<1>(N),Domain<1>(5)), 1.0);
+  i_fftm_type i_fftm(Domain<2>(Domain<1>(N),Domain<1>(5)), 1.0/N);
 
-  assert(f_fftm.input_size().size() == N);
-  assert(f_fftm.output_size().size() == N2);
+  assert(f_fftm.input_size().size() == 5*N);
+  assert(f_fftm.output_size().size() == 5*N2);
 
-  assert(i_fftm.input_size().size() == N2);
-  assert(i_fftm.output_size().size() == N);
+  assert(i_fftm.input_size().size() == 5*N2);
+  assert(i_fftm.output_size().size() == 5*N);
 
   assert(f_fftm.scale() == 1.0);  // can represent exactly
   assert(i_fftm.scale() > 1.0/(N + 1) && i_fftm.scale() < 1.0/(N - 1));
   assert(f_fftm.forward() == true);
   assert(i_fftm.forward() == false);
 
-  Matrix<T> in(N, T());
-  Matrix<std::complex<T> > out(N2);
-  Matrix<std::complex<T> > ref(N2);
-  Matrix<T> inv(N);
-  Matrix<T> inv2(N);
+  Matrix<T> in(N, 5, T());
+  Matrix<std::complex<T> > out(N2, 5);
+  Matrix<std::complex<T> > ref(N2, 5);
+  Matrix<T> inv(N, 5);
 
-  setup_data(set, in, 3.0);
+  setup_data_y(in);
+  dft_y_real(in, ref);
   out = f_fftm(in);
-
-  if (set == 1)
-  {
-    setup_data(3, ref, 3.0);
-    assert(error_db(ref, out) < -100);
-  }
-  if (set == 3)
-  {
-    setup_data(1, ref, 3.0 * N);
-    assert(error_db(ref, out) < -100);
-  }
-
-  ref = out;
   inv = i_fftm(out);
 
+  assert(error_db(ref, out) < -100);
   assert(error_db(inv, in) < -100);
-
-  // make sure out has not been scribbled in during the conversion.
-  assert(error_db(ref,out) < -100);
 }
 
-#endif
 
 
 int
@@ -494,12 +503,10 @@ main()
   test_by_val_y<complex<float> >(18);
   test_by_val_y<complex<float> >(256);
 
-# if 0
   // Tests for test r->c, c->r.
   test_real<float>(128);
   test_real<float>(242);
   test_real<float>(16);
-# endif
 #endif
 
 #if defined(VSIP_IMPL_FFT_USE_DOUBLE)
@@ -519,12 +526,10 @@ main()
   test_by_val_y<complex<double> >(18);
   test_by_val_y<complex<double> >(256);
 
-# if 0
   // Tests for test r->c, c->r.
   test_real<double>(128);
   test_real<double>(242);
   test_real<double>(16);
-# endif
 #endif
 
   return 0;
Index: tests/fft.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/tests/fft.cpp,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -p -r1.7 -r1.8
--- tests/fft.cpp	28 Sep 2005 04:32:54 -0000	1.7
+++ tests/fft.cpp	29 Sep 2005 02:01:10 -0000	1.8
@@ -17,6 +17,7 @@
 #include <vsip/support.hpp>
 #include <vsip/signal.hpp>
 #include <vsip/math.hpp>
+#include <vsip/random.hpp>
 
 #include "test.hpp"
 #include "output.hpp"
@@ -134,6 +135,52 @@ error_db(
   return maxsum;
 }
 
+// Error metric between two Matrices.
+
+template <typename T1,
+	  typename T2,
+	  typename Block1,
+	  typename Block2>
+double
+error_db(
+  const_Matrix<T1, Block1> v1,
+  const_Matrix<T2, Block2> v2)
+{
+  double maxsum = -250;
+  for (unsigned i = 0; i < v1.size(0); ++i)
+  {
+    double sum = error_db(v1.row(i), v2.row(i));
+    if (sum > maxsum)
+      maxsum = sum;
+  }
+  return maxsum;
+}
+
+
+
+// Error metric between two Tensors.
+
+template <typename T1,
+	  typename T2,
+	  typename Block1,
+	  typename Block2>
+double
+error_db(
+  const_Tensor<T1, Block1> v1,
+  const_Tensor<T2, Block2> v2)
+{
+  double maxsum = -250;
+  for (unsigned i = 0; i < v1.size(0); ++i)
+  {
+    vsip::Domain<1> y(v1.size(1));
+    vsip::Domain<1> x(v1.size(2));
+    double sum = error_db(v1(i,y,x), v2(i,y,x));
+    if (sum > maxsum)
+      maxsum = sum;
+  }
+  return maxsum;
+}
+
 
 
 // Setup input data for Fft.
@@ -307,12 +354,573 @@ test_real(const int set, const length_ty
   assert(error_db(ref,out) < -100);
 }
 
+/////////////////////////////////////////////////////////////////////
+//
+// Comprehensive 2D, 3D test
+//
+
+// Elt: unsigned -> element type
+
+template <typename T, bool realV> struct Elt;
+template <typename T> struct Elt<T,true>
+{
+  typedef T in_type;
+  typedef std::complex<T> out_type;
+};
+template <typename T> struct Elt<T,false>
+{
+  typedef std::complex<T> in_type;
+  typedef std::complex<T> out_type;
+};
+
+template <unsigned Dim, typename T, unsigned L> struct Arg;
+
+template <unsigned Dim, typename T> 
+struct Arg<Dim,T,0>
+{
+  typedef typename vsip::impl::View_of_dim<Dim,T,
+    vsip::Dense<Dim,T,typename vsip::impl::Row_major<Dim>::type> >::type type;
+};
+
+template <unsigned Dim, typename T> 
+struct Arg<Dim,T,1>
+{
+  typedef typename vsip::impl::View_of_dim<Dim,T,
+    vsip::Dense<Dim,T,typename vsip::impl::Col_major<Dim>::type> >::type type;
+};
+
+template <unsigned Dim, typename T> 
+struct Arg<Dim,T,2>
+{
+  typedef typename vsip::impl::View_of_dim<Dim,T,
+    vsip::impl::Fast_block<Dim,T,
+      vsip::impl::Layout<Dim,
+        typename vsip::impl::Row_major<Dim>::type,
+        vsip::impl::Stride_unit_dense
+  > > >::type type;
+};
+
+inline unsigned 
+adjust_size(unsigned size, bool is_short, bool is_short_dim, bool no_odds)
+{ 
+  // no odd sizes along axis for real->complex
+  if ((size & 1) && no_odds && is_short_dim)
+    ++size;
+  return (is_short && is_short_dim) ? size / 2 + 1 : size;
+}
+
+template <unsigned Dim> vsip::Domain<Dim> make_dom(unsigned*, bool, int, bool);
+template <> vsip::Domain<2> make_dom<2>(
+  unsigned* d, bool is_short, int sd, bool no_odds)
+{
+  return  vsip::Domain<2>(
+    vsip::Domain<1>(adjust_size(d[1], is_short, sd == 0, no_odds)),
+    vsip::Domain<1>(adjust_size(d[2], is_short, sd == 1, no_odds)));
+} 
+template <> vsip::Domain<3> make_dom<3>(
+  unsigned* d, bool is_short, int sd, bool no_odds)
+{
+  return vsip::Domain<3>(
+    vsip::Domain<1>(adjust_size(d[0], is_short, sd == 0, no_odds)),
+    vsip::Domain<1>(adjust_size(d[1], is_short, sd == 1, no_odds)),
+    vsip::Domain<1>(adjust_size(d[2], is_short, sd == 2, no_odds)));
+} 
+
+template <typename T, typename BlockT>
+vsip::Domain<2>
+domain_of(vsip::Matrix<T,BlockT> const& src)
+{
+  return vsip::Domain<2>(vsip::Domain<1>(src.size(0)),
+                         vsip::Domain<1>(src.size(1)));
+} 
+ 
+
+template <typename T, typename BlockT>
+vsip::Domain<3>
+domain_of(vsip::Tensor<T,BlockT> const& src)
+{
+  return vsip::Domain<2>(vsip::Domain<1>(src.size(0)),
+                         vsip::Domain<1>(src.size(1)),
+                         vsip::Domain<1>(src.size(2)));
+} 
+
+//
+
+template <typename T, typename BlockT>
+vsip::Matrix<T,BlockT>
+force_copy_init(vsip::Matrix<T,BlockT> const& src)
+{ 
+  vsip::Matrix<T,BlockT> tmp(src.size(0), src.size(1));
+  tmp = src;
+  return tmp;
+}
+
+template <typename T, typename BlockT>
+vsip::Tensor<T,BlockT>
+force_copy_init(vsip::Tensor<T,BlockT> const& src)
+{ 
+  vsip::Tensor<T,BlockT> tmp(src.size(0), src.size(1), src.size(2));
+  tmp = src;
+  return tmp;
+}
+
+//
+
+template <typename T> void set_values(T& v1, T& v2)
+{ v1 = T(10); v2 = T(20); }
+
+template <typename T> void set_values(std::complex<T>& z1, std::complex<T>& z2)
+{
+  z1 = std::complex<T>(T(10), T(10));
+  z2 = std::complex<T>(T(20), T(20));
+}
+
+#if 1
+
+// 2D 
+
+template <typename BlockT, typename T>
+void fill_random(
+  vsip::Matrix<T,BlockT> in, vsip::Rand<T>& rander)
+{
+  in = (rander.randu(in.size(0), in.size(1)) * 20.0) - 10.0;
+}
+
+template <typename BlockT, typename T>
+void fill_random(
+  vsip::Matrix<std::complex<T>,BlockT> in,
+  vsip::Rand<std::complex<T> >& rander)
+{
+  in = rander.randu(in.size(0), in.size(1)) * std::complex<T>(20.0) -
+         std::complex<T>(10.0, 10.0);
+}
+
+// 3D 
+
+template <typename BlockT, typename T>
+void fill_random(
+  vsip::Tensor<T,BlockT>& in, vsip::Rand<T>& rander)
+{
+  vsip::Domain<2> sub(vsip::Domain<1>(in.size(1)),
+                      vsip::Domain<1>(in.size(2))); 
+  for (unsigned i = in.size(0); i-- > 0;)
+    fill_random(in(i, vsip::Domain<1>(in.size(1)),
+                      vsip::Domain<1>(in.size(2))), rander);
+}
+
+#else
+// debug -- keep this.
+
+// 2D 
+
+template <typename BlockT, typename T>
+void fill_random(
+  vsip::Matrix<T,BlockT> in, vsip::Rand<T>& rander)
+{
+  in = T(0);
+  in.block().put(0, 0, T(1.0));
+}
+
+// 3D 
+
+template <typename BlockT, typename T>
+void fill_random(
+  vsip::Tensor<T,BlockT>& in, vsip::Rand<T>& rander)
+{
+  in = T(0);
+  in.block().put(0, 0, 0, T(1.0));
+}
+
+#endif
+
+//////
+
+// 2D, cc
+
+template <typename T, typename inBlock, typename outBlock>
+void 
+compute_ref(
+  vsip::Matrix<std::complex<T>,inBlock> const& in,
+  vsip::Domain<2> const& in_dom, 
+  vsip::Matrix<std::complex<T>,outBlock>& ref,
+  vsip::Domain<2> const& out_dom,
+  int (& /* dum */)[1])
+{
+  vsip::Fftm<std::complex<T>,std::complex<T>,0,
+             vsip::fft_fwd,vsip::by_reference,1>  fftm_across(in_dom, 1.0);
+  fftm_across(in, ref);
+
+  vsip::Fftm<std::complex<T>,std::complex<T>,1,
+             vsip::fft_fwd,vsip::by_reference,1>  fftm_down(out_dom, 1.0);
+  fftm_down(ref);
+}
+
+// 2D, rc
+
+template <typename T, typename inBlock, typename outBlock>
+void 
+compute_ref(
+  vsip::Matrix<T,inBlock> const& in,
+  vsip::Domain<2> const& in_dom, 
+  vsip::Matrix<std::complex<T>,outBlock>& ref,
+  vsip::Domain<2> const& out_dom,
+  int (& /* dum */)[1])
+{
+  vsip::Fftm<T,std::complex<T>,1,
+    vsip::fft_fwd,vsip::by_reference,1>  fftm_across(in_dom, 1.0);
+  fftm_across(in, ref);
+
+  typedef std::complex<T> CT;
+  vsip::Fftm<CT,CT,0,
+    vsip::fft_fwd,vsip::by_reference,1>  fftm_down(out_dom, 1.0);
+  fftm_down(ref);
+}
+
+// 2D, rc
+
+template <typename T, typename inBlock, typename outBlock>
+void 
+compute_ref(
+  vsip::Matrix<T,inBlock> const& in,
+  vsip::Domain<2> const& in_dom, 
+  vsip::Matrix<std::complex<T>,outBlock>& ref,
+  vsip::Domain<2> const& out_dom,
+  int (& /* dum */)[2])
+{
+  vsip::Fftm<T,std::complex<T>,0,
+    vsip::fft_fwd,vsip::by_reference,1>  fftm_across(in_dom, 1.0);
+  fftm_across(in, ref);
+
+  typedef std::complex<T> CT;
+  vsip::Fftm<CT,CT,1,
+    vsip::fft_fwd,vsip::by_reference,1>  fftm_down(out_dom, 1.0);
+  fftm_down(ref);
+}
+
+// 3D, cc
+
+template <typename T, typename inBlock, typename outBlock>
+void 
+compute_ref(
+  vsip::Tensor<std::complex<T>,inBlock> const& in,
+  vsip::Domain<3> const& in_dom, 
+  vsip::Tensor<std::complex<T>,outBlock>& ref,
+  vsip::Domain<3> const& out_dom, 
+  int (& /* dum */)[1]) 
+{
+  typedef std::complex<T> CT;
+
+  vsip::Fft<vsip::const_Matrix,CT,CT,vsip::fft_fwd,vsip::by_reference,1>  fft_across(
+    vsip::Domain<2>(in_dom[1], in_dom[2]), 1.0);
+  for (unsigned i = in_dom[0].size(); i-- > 0; )
+    fft_across(in(i, in_dom[1], in_dom[2]),
+              ref(i, out_dom[1], out_dom[2]));
+
+  // note: axis ---v--- here is reverse of notation used otherwise.
+  vsip::Fftm<CT,CT,1,vsip::fft_fwd,vsip::by_reference,1>  fftm_down(
+    vsip::Domain<2>(in_dom[0], in_dom[1]), 1.0);
+  for (unsigned k = in_dom[2].size(); k-- > 0; )
+    fftm_down(ref(out_dom[0], out_dom[1], k));
+}
+
+// 3D, rc, shorten bottom-top
+
+template <typename T, typename inBlock, typename outBlock>
+void 
+compute_ref(
+  vsip::Tensor<T,inBlock> const& in,
+  vsip::Domain<3> const& in_dom, 
+  vsip::Tensor<std::complex<T>,outBlock>& ref,
+  vsip::Domain<3> const& out_dom,
+  int (& /* dum */)[1]) 
+{
+  typedef std::complex<T> CT;
+
+  // first, planes left-right, squeeze top-bottom
+  vsip::Fft<vsip::const_Matrix,T,CT,0,vsip::by_reference,1>   fft_across(
+    vsip::Domain<2>(in_dom[0], in_dom[1]), 1.0);
+  for (unsigned k = in_dom[2].size(); k-- > 0; )
+    fft_across(in(in_dom[0], in_dom[1], k),
+            ref(out_dom[0], out_dom[1], k));
+
+  // planes top-bottom, running left-right
+  // note: axis ---v--- here is reverse of notation used otherwise.
+  vsip::Fftm<CT,CT,0,vsip::fft_fwd,vsip::by_reference,1>   fftm_down(
+    vsip::Domain<2>(in_dom[1], in_dom[2]), 1.0);
+  for (unsigned i = out_dom[0].size(); i-- > 0; )
+    fftm_down(ref(i, out_dom[1], out_dom[2]));
+}
+
+// 3D, rc, shorten front->back
+
+template <typename T, typename inBlock, typename outBlock>
+void 
+compute_ref(
+  vsip::Tensor<T,inBlock> const& in,
+  vsip::Domain<3> const& in_dom, 
+  vsip::Tensor<std::complex<T>,outBlock>& ref,
+  vsip::Domain<3> const& out_dom, 
+  int (& /* dum */)[2]) 
+{
+  typedef std::complex<T> CT;
+
+  // planes top-bottom, squeeze front-back
+  vsip::Fft<vsip::const_Matrix,T,CT,0,vsip::by_reference,1>   fft_across(
+    vsip::Domain<2>(in_dom[1], in_dom[2]), 1.0);
+  for (unsigned i = in_dom[0].size(); i-- > 0; )
+    fft_across(in(i, in_dom[1], in_dom[2]),
+              ref(i, out_dom[1], out_dom[2]));
+
+  // planes front-back, running bottom-top
+  // note: axis ---v--- here is reverse of notation used otherwise.
+  vsip::Fftm<CT,CT,1,vsip::fft_fwd,vsip::by_reference,1>   fftm_down(
+    vsip::Domain<2>(in_dom[0], in_dom[2]), 1.0);
+  for (unsigned j = out_dom[1].size(); j-- > 0; )
+    fftm_down(ref(out_dom[0], j, out_dom[2]));
+}
+
+// 3D, rc, shorten left-right
+
+template <typename T, typename inBlock, typename outBlock>
+void 
+compute_ref(
+  vsip::Tensor<T,inBlock> const& in,
+  vsip::Domain<3> const& in_dom, 
+  vsip::Tensor<std::complex<T>,outBlock>& ref,
+  vsip::Domain<3> const& out_dom, 
+  int (& /* dum */)[3])
+{
+  typedef std::complex<T> CT;
+
+  // planes top-bottom, squeeze left-right
+  vsip::Fft<vsip::const_Matrix,T,CT,1,vsip::by_reference,1>   fft_across(
+    vsip::Domain<2>(in_dom[1], in_dom[2]), 1.0);
+  for (unsigned i = in_dom[0].size(); i-- > 0; )
+    fft_across(in(i, in_dom[1], in_dom[2]),
+              ref(i, out_dom[1], out_dom[2]));
+
+  // planes left-right, running bottom-top
+  // note: axis ---v--- here is reverse of notation used otherwise.
+  vsip::Fftm<CT,CT,1,vsip::fft_fwd,vsip::by_reference,1>   fftm_down(
+    vsip::Domain<2>(in_dom[0], in_dom[1]), 1.0);
+  for (unsigned k = out_dom[2].size(); k-- > 0; )
+    fftm_down(ref(out_dom[0], out_dom[1], k));
+}
+
+template <unsigned Dim, typename T1, typename T2,
+	  int sD, vsip::return_mechanism_type How>
+struct Test_fft;
+
+template <typename T1, typename T2, int sD, vsip::return_mechanism_type How>
+struct Test_fft<2,T1,T2,sD,How>
+{ typedef vsip::Fft<vsip::const_Matrix,T1,T2,sD,How,1,vsip::alg_time>  type; };
+
+template <typename T1, typename T2, int sD, vsip::return_mechanism_type How>
+struct Test_fft<3,T1,T2,sD,How>
+{ typedef vsip::Fft<vsip::const_Tensor,T1,T2,sD,How,1,vsip::alg_time>  type; };
+
+// check_in_place
+//
+
+// there is no in-place for real->complex
+
+template <template <typename,typename> class ViewT1,
+          template <typename,typename> class ViewT2,
+          template <typename,typename> class ViewT3,
+	  typename T, typename Block1, typename Block2, int sDf, int sDi>
+void
+check_in_place(
+  vsip::Fft<ViewT1,T,std::complex<T>,sDf,vsip::by_reference,1,vsip::alg_time>&,
+  vsip::Fft<ViewT1,std::complex<T>,T,sDi,vsip::by_reference,1,vsip::alg_time>&,
+  ViewT2<T,Block1>&, ViewT3<std::complex<T>,Block2>&, double)
+{ }
+
+template <template <typename,typename> class ViewT1,
+          template <typename,typename> class ViewT2,
+          template <typename,typename> class ViewT3,
+	  typename T, typename Block1, typename Block2>
+void
+check_in_place(
+  vsip::Fft<ViewT1,T,T,vsip::fft_fwd,vsip::by_reference,1,vsip::alg_time>&  fwd,
+  vsip::Fft<ViewT1,T,T,vsip::fft_inv,vsip::by_reference,1,vsip::alg_time>&  inv,
+  ViewT2<T,Block1> const&  in,
+  ViewT3<T,Block2> const&  ref,
+  double scalei)
+{
+  typename vsip::impl::View_of_dim<Block1::dim,T,Block1>::type  inout(
+    force_copy_init(in));
+
+  fwd(inout);
+  assert(error_db(inout, ref) < -100); 
+
+  inv(inout);
+  inout *= T(scalei);
+  assert(error_db(inout, in) < -100); 
+}
+
+// when testing matrices, will use latter two values
+
+unsigned  sizes[][3] =
+{
+  { 2, 2, 2 },
+  { 8, 8, 8 },
+  { 1, 1, 1 },
+  { 2, 2, 1 },
+  { 2, 8, 128 },
+  { 3, 5, 7 },
+  { 2, 24, 48 },
+  { 24, 1, 5 },
+};
+
+//   the generic test
+
+template <unsigned inL, unsigned outL, typename F, bool isReal,
+          unsigned Dim, int sD>
+void 
+test_fft()
+{
+  typedef typename Elt<F,isReal>::in_type in_elt_type;
+  typedef typename Elt<F,false>::out_type out_elt_type;
+
+  static const int sdf = (sD < 0) ? vsip::fft_fwd : sD;
+  static const int sdi = (sD < 0) ? vsip::fft_inv : sD;
+  typedef typename Test_fft<Dim,in_elt_type,out_elt_type,
+                    sdf,vsip::by_reference>::type         fwd_by_ref_type;
+  typedef typename Test_fft<Dim,in_elt_type,out_elt_type,
+                    sdf,vsip::by_value>::type             fwd_by_value_type;
+  typedef typename Test_fft<Dim,out_elt_type,in_elt_type,
+                    sdi,vsip::by_reference>::type         inv_by_ref_type;
+  typedef typename Test_fft<Dim,out_elt_type,in_elt_type,
+                    sdi,vsip::by_value>::type             inv_by_value_type;
+
+  typedef typename Arg<Dim,in_elt_type,inL>::type    in_type;
+  typedef typename Arg<Dim,out_elt_type,outL>::type  out_type;
+
+  for (unsigned i = 0; i < sizeof(sizes)/(sizeof(*sizes)*3); ++i)
+  {
+    vsip::Rand<in_elt_type> rander(
+      sizes[i][0] * sizes[i][1] * sizes[i][2] * Dim * (sD+5) * (isReal+1));
+
+    Domain<Dim>  in_dom(make_dom<Dim>(sizes[i], false, sD, isReal)); 
+    Domain<Dim>  out_dom(make_dom<Dim>(sizes[i], isReal, sD, isReal)); 
+
+    typedef typename in_type::block_type   in_block_type;
+    typedef typename out_type::block_type  out_block_type;
+
+    in_block_type  in_block(in_dom);
+    in_type  in(in_block);
+    fill_random(in, rander);
+    in_type  in_copy(force_copy_init(in));
+
+    out_block_type  ref1_block(out_dom);
+    out_type  ref1(ref1_block);
+    int dum[(sD < 0) ? 1 : sD + 1];
+    compute_ref(in, in_dom, ref1, out_dom, dum);
+
+    out_type  ref4(force_copy_init(ref1));
+    ref4 *= out_elt_type(0.25);
+
+    out_type  refN(force_copy_init(ref1));
+    refN /= out_elt_type(in_dom.size());
+
+    assert(error_db(in, in_copy) < -200);  // not clobbered
+
+    { fwd_by_ref_type  fft_ref1(in_dom, 1.0);
+      out_block_type  out_block(out_dom);
+      out_type  out(out_block);
+      out_type  other = fft_ref1(in, out);
+      assert(&out.block() == &other.block());
+      assert(error_db(in, in_copy) < -200);  // not clobbered
+      assert(error_db(out, ref1) < -100); 
+
+      inv_by_ref_type  inv_refN(in_dom, 1.0/in_dom.size());
+      in_block_type  in2_block(in_dom);
+      in_type  in2(in2_block);
+      inv_refN(out, in2);
+      assert(error_db(out, ref1) < -100);  // not clobbered
+      assert(error_db(in2, in) < -100); 
+
+      check_in_place(fft_ref1, inv_refN, in, ref1, 1.0);
+    }
+    { fwd_by_ref_type  fft_ref4(in_dom, 0.25);
+      out_block_type  out_block(out_dom);
+      out_type  out(out_block);
+      out_type  other = fft_ref4(in, out);
+      assert(&out.block() == &other.block());
+      assert(error_db(in, in_copy) < -200);  // not clobbered
+      assert(error_db(out, ref4) < -100); 
+
+      inv_by_ref_type  inv_ref8(in_dom, .125);
+      in_block_type  in2_block(in_dom);
+      in_type  in2(in2_block);
+      inv_ref8(out, in2);
+      assert(error_db(out, ref4) < -100);  // not clobbered
+      in2 /= in_elt_type(in_dom.size() / 32.0);
+      assert(error_db(in2, in) < -100); 
+
+      check_in_place(fft_ref4, inv_ref8, in, ref4, 32.0/in_dom.size());
+    }
+    { fwd_by_ref_type  fft_refN(in_dom, 1.0/in_dom.size());
+      out_block_type  out_block(out_dom);
+      out_type  out(out_block);
+      out_type  other = fft_refN(in, out);
+      assert(&out.block() == &other.block());
+      assert(error_db(in, in_copy) < -200);  // not clobbered
+      assert(error_db(out, refN) < -100); 
+
+      inv_by_ref_type  inv_ref1(in_dom, 1.0);
+      in_block_type  in2_block(in_dom);
+      in_type  in2(in2_block);
+      inv_ref1(out, in2);
+      assert(error_db(out, refN) < -100);  // not clobbered
+      assert(error_db(in2, in) < -100); 
+
+      check_in_place(fft_refN, inv_ref1, in, refN, 1.0);
+    }
+    
+
+    { fwd_by_value_type  fwd_val1(in_dom, 1.0);
+      out_type  out(fwd_val1(in));
+      assert(error_db(in, in_copy) < -200);  // not clobbered
+      assert(error_db(out, ref1) < -100); 
+
+      inv_by_value_type  inv_valN(in_dom, 1.0/in_dom.size());
+      in_type  in2(inv_valN(out));
+      assert(error_db(out, ref1) < -100);    // not clobbered
+      assert(error_db(in2, in) < -100); 
+    }
+    { fwd_by_value_type  fwd_val4(in_dom, 0.25);
+      out_type  out(fwd_val4(in));
+      assert(error_db(in, in_copy) < -200);  // not clobbered
+      assert(error_db(out, ref4) < -100); 
+
+      inv_by_value_type  inv_val8(in_dom, 0.125);
+      in_type  in2(inv_val8(out));
+      assert(error_db(out, ref4) < -100);    // not clobbered
+      in2 /= in_elt_type(in_dom.size() / 32.0);
+      assert(error_db(in2, in) < -100); 
+    }
+    { fwd_by_value_type  fwd_valN(in_dom, 1.0/in_dom.size());
+      out_type  out(fwd_valN(in));
+      assert(error_db(in, in_copy) < -200);  // not clobbered
+      assert(error_db(out, refN) < -100); 
+
+      inv_by_value_type  inv_val1(in_dom, 1.0);
+      in_type  in2(inv_val1(out));
+      assert(error_db(out, refN) < -100);    // not clobbered
+      assert(error_db(in2, in) < -100); 
+    }
+  }
+};
 
 int
 main()
 {
   vsipl init;
 
+//
+// First check 1D 
+//
 #if defined(VSIP_IMPL_FFT_USE_FLOAT)
 
   test_by_ref<complex<float> >(2, 64);
@@ -329,7 +937,7 @@ main()
   test_real<float>(2, 242);
   test_real<float>(3, 16);
 
-#endif
+#endif 
 
 #if defined(VSIP_IMPL_FFT_USE_DOUBLE)
 
@@ -347,6 +955,126 @@ main()
   test_real<double>(2, 242);
   test_real<double>(3, 16);
 
+#endif 
+
+
+//
+// check 2D, 3D
+//
+
+#if defined(VSIP_IMPL_FFT_USE_FLOAT)
+
+  test_fft<0,0,float,false,2,vsip::fft_fwd>();
+
+#if ! defined(VSIP_IMPL_IPP_FFT)
+  test_fft<0,0,float,false,3,vsip::fft_fwd>();
+
+  test_fft<0,0,float,true,2,1>();
+  test_fft<0,0,float,true,2,0>();
+
+  test_fft<0,0,float,true,3,2>();
+  test_fft<0,0,float,true,3,1>();
+  test_fft<0,0,float,true,3,0>();
+#endif   /* VSIP_IMPL_IPP_FFT */
+
+#endif 
+
+#if defined(VSIP_IMPL_FFT_USE_DOUBLE)
+
+#if ! defined(VSIP_IMPL_IPP_FFT)
+  test_fft<0,0,double,false,2,vsip::fft_fwd>();
+  test_fft<0,0,double,false,3,vsip::fft_fwd>();
+
+  test_fft<0,0,double,true,2,1>();
+  test_fft<0,0,double,true,2,0>();
+
+  test_fft<0,0,double,true,3,2>();
+  test_fft<0,0,double,true,3,1>();
+  test_fft<0,0,double,true,3,0>();
+#endif  /* VSIP_IMPL_IPP_FFT */
+
+#endif
+
+//
+// check with different block types
+//
+
+#if defined(VSIP_IMPL_FFT_USE_FLOAT)
+# define SCALAR float
+#elif defined(VSIP_IMPL_FFT_USE_FLOAT)
+# define SCALAR double
+#endif
+
+#if defined(SCALAR)
+
+  test_fft<0,1,SCALAR,false,2,vsip::fft_fwd>();
+  test_fft<0,2,SCALAR,false,2,vsip::fft_fwd>();
+  test_fft<1,0,SCALAR,false,2,vsip::fft_fwd>();
+  test_fft<1,1,SCALAR,false,2,vsip::fft_fwd>();
+  test_fft<1,2,SCALAR,false,2,vsip::fft_fwd>();
+  test_fft<2,0,SCALAR,false,2,vsip::fft_fwd>();
+  test_fft<2,1,SCALAR,false,2,vsip::fft_fwd>();
+  test_fft<2,2,SCALAR,false,2,vsip::fft_fwd>();
+
+#if ! defined(VSIP_IMPL_IPP_FFT)
+  test_fft<0,1,SCALAR,true,2,1>();
+  test_fft<0,1,SCALAR,true,2,0>();
+  test_fft<0,2,SCALAR,true,2,1>();
+  test_fft<0,2,SCALAR,true,2,0>();
+
+  test_fft<1,0,SCALAR,true,2,1>();
+  test_fft<1,0,SCALAR,true,2,0>();
+  test_fft<1,1,SCALAR,true,2,1>();
+  test_fft<1,1,SCALAR,true,2,0>();
+  test_fft<1,2,SCALAR,true,2,1>();
+  test_fft<1,2,SCALAR,true,2,0>();
+
+  test_fft<2,0,SCALAR,true,2,1>();
+  test_fft<2,0,SCALAR,true,2,0>();
+  test_fft<2,1,SCALAR,true,2,1>();
+  test_fft<2,1,SCALAR,true,2,0>();
+  test_fft<2,2,SCALAR,true,2,1>();
+  test_fft<2,2,SCALAR,true,2,0>();
+
+
+  test_fft<0,1,SCALAR,false,3,vsip::fft_fwd>();
+  test_fft<0,2,SCALAR,false,3,vsip::fft_fwd>();
+  test_fft<1,0,SCALAR,false,3,vsip::fft_fwd>();
+  test_fft<1,1,SCALAR,false,3,vsip::fft_fwd>();
+  test_fft<1,2,SCALAR,false,3,vsip::fft_fwd>();
+  test_fft<2,0,SCALAR,false,3,vsip::fft_fwd>();
+  test_fft<2,1,SCALAR,false,3,vsip::fft_fwd>();
+  test_fft<2,2,SCALAR,false,3,vsip::fft_fwd>();
+
+  test_fft<0,1,SCALAR,true,3,2>();
+  test_fft<0,1,SCALAR,true,3,1>();
+  test_fft<0,1,SCALAR,true,3,0>();
+  test_fft<0,2,SCALAR,true,3,2>();
+  test_fft<0,2,SCALAR,true,3,1>();
+  test_fft<0,2,SCALAR,true,3,0>();
+
+  test_fft<1,0,SCALAR,true,3,2>();
+  test_fft<1,0,SCALAR,true,3,1>();
+  test_fft<1,0,SCALAR,true,3,0>();
+  test_fft<1,1,SCALAR,true,3,2>();
+  test_fft<1,1,SCALAR,true,3,1>();
+  test_fft<1,1,SCALAR,true,3,0>();
+  test_fft<1,2,SCALAR,true,3,2>();
+  test_fft<1,2,SCALAR,true,3,1>();
+  test_fft<1,2,SCALAR,true,3,0>();
+
+  test_fft<2,0,SCALAR,true,3,2>();
+  test_fft<2,0,SCALAR,true,3,1>();
+  test_fft<2,0,SCALAR,true,3,0>();
+  test_fft<2,1,SCALAR,true,3,2>();
+  test_fft<2,1,SCALAR,true,3,1>();
+  test_fft<2,1,SCALAR,true,3,0>();
+  test_fft<2,2,SCALAR,true,3,2>();
+  test_fft<2,2,SCALAR,true,3,1>();
+  test_fft<2,2,SCALAR,true,3,0>();
+
+#endif  /* VSIP_IMPL_IPP_FFT */
+
 #endif
 
   return 0;