[PATCH] Use IPP for Fir<>

Thu Oct 13 10:44:41 UTC 2005

I have checked in the patch below.

It makes vsip::Fir<> use IPP's FIR support where possible.  In practice,
that means whenever block size and decimation are not relatively prime. 
(IPP produces bad output when they are.  The IPP API seems to make it 
impossible, so it amounts to an IPP documentation bug.)  Fir<> uses 
the native C++ implementation for such cases.  They are probably rare 
in real programs.

The spec says the copy constructor Fir(Fir const&) is supposed to 
be VSIP_NOTHROW, but it seems to me that to implement it safely, it 
needs to do allocation.  I declared it VSIP_THROW((std::bad_alloc)).

The no-macro method used here to adapt to IPP's version of overloading 
is similar to that in fft-core.hpp, and seems practical for general 
use.

Nathan Myers
ncm

Index: ChangeLog
===================================================================
RCS file: /home/cvs/Repository/vpp/ChangeLog,v
retrieving revision 1.292
retrieving revision 1.293
diff -u -p -r1.292 -r1.293

--- ChangeLog	12 Oct 2005 13:40:17 -0000	1.292
+++ ChangeLog	13 Oct 2005 10:23:34 -0000	1.293
@@ -1,3 +1,8 @@
+2005-10-13  Nathan Myers  <ncm at codesourcery.com>
+
+	* src/vsip/impl/signal-fir.hpp: use IPP FIR support where available.
+	* tests/fir.cpp: forgive FFT noise on big samples.
+
 2005-10-12 Jules Bergmann  <jules at codesourcery.com>
 
 	* configure.ac (--with-atlas-prefix, --with-atlas-libdir): New
Index: src/vsip/impl/signal-fir.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/signal-fir.hpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -p -r1.2 -r1.3
--- src/vsip/impl/signal-fir.hpp	10 Oct 2005 06:33:40 -0000	1.2
+++ src/vsip/impl/signal-fir.hpp	13 Oct 2005 10:23:34 -0000	1.3
@@ -19,6 +19,11 @@
 #include <vsip/impl/global_map.hpp>
 #include <vsip/impl/profile.hpp>
 
+#if VSIP_IMPL_HAVE_IPP
+#include <vsip/impl/ipp.hpp>
+#include <ipps.h>
+#endif
+
 #include <new>
 
 namespace vsip
@@ -43,6 +48,52 @@ struct Fir_aligned
     block_type;
 };
 
+#if VSIP_IMPL_HAVE_IPP
+
+template
+<
+  typename T, typename IppT,
+  IppStatus (&ippFirF)(IppT const*,IppT*,int,IppT const*,int,IppT*,int*),
+  IppStatus (&ippFirDecF)(
+    IppT const*,IppT*,int,IppT const*,int,int,int,int,int,IppT*)
+>
+struct Ipp_fir_driver_base
+{
+  static void
+  run_fir(
+    T const* xin, T* xout, vsip::length_type outsize,
+    T const* xkernel, vsip::length_type ksize,  
+    T* xstate, vsip::length_type* xstate_ix, vsip::length_type dec)
+  {
+    IppT const* const  in = reinterpret_cast<IppT const*>(xin);
+    IppT* const  out = reinterpret_cast<IppT*>(xout);
+    IppT const* const  kernel = reinterpret_cast<IppT const*>(xkernel);
+    IppT* const  state = reinterpret_cast<IppT*>(xstate);
+    int state_ix = *xstate_ix;
+    IppStatus stat = (dec == 1) ?
+      ippFirF(in, out, outsize, kernel, ksize, state, &state_ix) :
+      ippFirDecF(in, out, outsize, kernel, ksize, 1, 0, dec, 0, state);
+    assert(stat == ippStsNoErr);
+    *xstate_ix = state_ix;
+  }
+};
+
+template<typename T> struct Ipp_fir_driver;
+
+template < > struct Ipp_fir_driver<float> : Ipp_fir_driver_base<
+  float,Ipp32f,ippsFIR_Direct_32f,ippsFIRMR_Direct_32f> { };
+
+template<> struct Ipp_fir_driver<double> : Ipp_fir_driver_base<
+  double,Ipp64f,ippsFIR_Direct_64f,ippsFIRMR_Direct_64f> {};
+
+template<> struct Ipp_fir_driver<std::complex<float> > : Ipp_fir_driver_base<
+  std::complex<float>,Ipp32fc,ippsFIR_Direct_32fc,ippsFIRMR_Direct_32fc> {};
+
+template<> struct Ipp_fir_driver<std::complex<double> > : Ipp_fir_driver_base<
+  std::complex<double>,Ipp64fc,ippsFIR_Direct_64fc,ippsFIRMR_Direct_64fc> {};
+
+#endif // VSIP_IMPL_HAVE_IPP
+
 } // namespace impl
 
 ///////////////////////////////////////////////////////////////////
@@ -63,22 +114,27 @@ class Fir
 public:
   static const vsip::symmetry_type  symmetry = symV;
   static const vsip::obj_state  continuous_filter = useOldState;
-
+  typedef typename impl::Fir_aligned<T>::block_type  block_type;
+ 
   template <typename BlockT>
   Fir(
     vsip::const_Vector<T,BlockT> kernel,
     vsip::length_type  input_size,
     vsip::length_type  decimation = 1)
         VSIP_THROW((std::bad_alloc))
-  : input_size_(input_size),
-    order_(kernel.size() * (1 + (symV != vsip::nonsym)) - 
-      (symV == vsip::sym_even_len_odd) - 1),
-    decimation_(decimation),
-    skip_(0),
-    state_saved_(0),
-    op_calls_(0),
-    state_(this->order_, T(0)),
-    kernel_(this->order_ + 1)
+  : input_size_(input_size)
+  , order_(kernel.size() * (1 + (symV != vsip::nonsym)) - 
+             (symV == vsip::sym_even_len_odd) - 1)
+  , decimation_(decimation)
+  , skip_(0)
+  , op_calls_(0)
+  , kernel_(this->order_ + 1)
+  , state_(2 * (this->order_ + 1), T(0))   // IPP wants 2x.
+  , state_saved_(0)
+#if VSIP_IMPL_HAVE_IPP
+  , temp_in_(this->input_size_)
+  , temp_out_(this->input_size_)
+#endif
   {
     assert(input_size > 0);
     assert(this->order_ + 1 > 1);  // counter unsigned wraparound
@@ -89,14 +145,58 @@ public:
     // must be after asserts because of division
     this->output_size_ = (input_size + decimation - 1) / decimation;
 
-    // mirror the kernel
-    unsigned const ksz = kernel.size();
-    this->kernel_(vsip::Domain<1>(this->order_, -1, ksz)) = kernel;
-    // and maybe unmirror a copy, too
-    if (symV != vsip::nonsym)
-      this->kernel_(vsip::Domain<1>(ksz)) = kernel;
+#if VSIP_IMPL_HAVE_IPP
+    // use IPP only if decimation is a factor of input size.
+    if (this->output_size_ * decimation == this->input_size_)
+    {
+      // IPP doesn't want it reversed.
+      this->kernel_(vsip::Domain<1>(kernel.size())) = kernel;
+      if (symV != vsip::nonsym)
+	this->kernel_(vsip::Domain<1>(
+	   this->kernel_.size() - 1, -1, kernel.size())) = kernel;
+    }
+    else
+#endif
+    {
+      // mirror the kernel
+      unsigned const ksz = kernel.size();
+      this->kernel_(vsip::Domain<1>(this->order_, -1, ksz)) = kernel;
+      // and maybe unmirror a copy, too
+      if (symV != vsip::nonsym)
+	this->kernel_(vsip::Domain<1>(ksz)) = kernel;
+    }
+  }
+
+  // FIXME: spec says this should be nothrow, but it has to allocate
+  Fir(Fir const& fir)  VSIP_THROW((std::bad_alloc))
+  : input_size_(fir.input_size_)
+  , order_(fir.order_)
+  , decimation_(fir.decimation_)
+  , skip_(fir.skip_)
+  , op_calls_(0)
+  , kernel_(fir.kernel_)
+  , state_(fir.state_(vsip::Domain<1>(fir.state_.size())))  // actual copy
+  , state_saved_(fir.state_saved_)
+#if VSIP_IMPL_HAVE_IPP
+  , temp_in_(this->input_size_)   // allocate
+  , temp_out_(this->input_size_)  // allocate
+#endif
+  { }
+
+  Fir& operator=(Fir const& fir) VSIP_NOTHROW
+  {
+    assert(this->input_size_ == fir.input_size_);
+    assert(this->order_ == fir.order_);
+    assert(this->decimation_ = fir.decimation_);
+    this->skip_ = fir.skip_;
+    this->op_calls_ = 0;
+    this->kernel_ = fir.kernel_;
+    this->state_ = fir.state_;
+    this->state_saved_ = fir.state_saved_;
   }
 
+  ~Fir()  VSIP_NOTHROW {}
+
   vsip::length_type    kernel_size() const   VSIP_NOTHROW
     { return this->order_ + 1; }
   vsip::length_type    filter_order() const   VSIP_NOTHROW
@@ -119,6 +219,7 @@ public:
     vsip::Vector<T, Block1>  out)
       VSIP_NOTHROW
   {
+    ++ this->op_calls_;
     assert(data.size() == this->input_size_);
     assert(out.size() == this->output_size_);
 
@@ -131,52 +232,78 @@ public:
     const len_type  saved = this->state_saved_;
     len_type  oix = 0;
     len_type i = 0;
-    for (; i < m - skip; ++oix, i += dec)
-    {
-      // Conceptually this comes second, but it's more convenient
-      // to put it here.  So, read the second statement below first.
-      T sum = vsip::dot(
-        this->kernel_(d_type(m - skip - i, 1, i + skip + 1)),
-        data(d_type(i + skip + 1)));
-
-      if (useOldState == vsip::state_save && i < saved)
-      {
-        sum += vsip::dot(
-          this->kernel_(d_type(saved - i)),
-          this->state_(d_type(i, 1, saved - i)));
-      }
 
-      out.put(oix, sum);
-    }
-        
-    const len_type  isz = this->input_size_;
-    len_type start = i - (m - skip);
-    for ( ; start + m < isz; ++oix, start += dec)
+#if VSIP_IMPL_HAVE_IPP
+    
+    // use IPP only if decimation is a factor of input size.
+    if (this->input_size_ == this->output_size_ * dec)
     {
-      T sum = vsip::dot(this->kernel_, data(d_type(start, 1, m + 1)));
-      out.put(oix, sum);
+      typedef impl::Layout<1,vsip::tuple<0,1,2>,
+	  impl::Stride_unit,impl::Cmplx_inter_fmt>  layout_type;
+      impl::Ext_data<Block0,layout_type>  raw_in(
+	  data.block(), impl::SYNC_IN,
+	  impl::Ext_data<block_type>(this->temp_in_.block()).data());
+      impl::Ext_data<Block1,layout_type>  raw_out(
+	  out.block(), impl::SYNC_OUT,
+	  impl::Ext_data<block_type>(this->temp_out_.block()).data());
+      impl::Ext_data<block_type,layout_type>  raw_kernel(this->kernel_.block());
+      impl::Ext_data<block_type,layout_type>  raw_state(this->state_.block());
+      oix = (this->input_size_ - skip + dec - 1) / dec;
+      
+      impl::Ipp_fir_driver<T>::run_fir(raw_in.data(), raw_out.data(), oix,
+	raw_kernel.data(), m + 1, raw_state.data(), &this->state_saved_, dec);
+
+      if (useOldState != state_save)
+	this->reset();
     }
+    else
+
+#endif
 
-    if (useOldState == state_save)
     {
-      this->skip_ = start + m - isz;
-      const len_type new_save = isz - start;
-      this->state_saved_ = new_save;
-      this->state_(d_type(new_save)) = data(d_type(start, 1, new_save));
+      for (; i < m - skip; ++oix, i += dec)
+      {
+	// Conceptually this comes second, but it's more convenient
+	// to put it here.  So, read the second statement below first.
+	T sum = vsip::dot(
+	  this->kernel_(d_type(m - skip - i, 1, i + skip + 1)),
+	  data(d_type(i + skip + 1)));
+
+	if (useOldState == vsip::state_save && i < saved)
+	{
+	  sum += vsip::dot(
+	    this->kernel_(d_type(saved - i)),
+	    this->state_(d_type(i, 1, saved - i)));
+	}
+
+	out.put(oix, sum);
+      }
+	  
+      const len_type  isz = this->input_size_;
+      len_type start = i - (m - skip);
+      for ( ; start + m < isz; ++oix, start += dec)
+      {
+	T sum = vsip::dot(this->kernel_, data(d_type(start, 1, m + 1)));
+	out.put(oix, sum);
+      }
+
+      if (useOldState == state_save)
+      {
+	this->skip_ = start + m - isz;
+	const len_type new_save = isz - start;
+	this->state_saved_ = new_save;
+	this->state_(d_type(new_save)) = data(d_type(start, 1, new_save));
+      }
     }
-    ++ this->op_calls_;
     return oix;
   }
 
   void reset()  VSIP_NOTHROW
     { this->state_saved_ = this->skip_ = 0; }
 
-  ~Fir()
-    { }
-
 public:
 
-  float impl_performance(char* what) const
+  float impl_performance(char* what) const  VSIP_NOTHROW
   {
     if (!strcmp(what, "mflops"))
     {
@@ -199,11 +326,14 @@ private:
   vsip::length_type  order_;         // M in the spec
   vsip::length_type  decimation_;
   vsip::length_type  skip_;          // how much of next input to skip
-  vsip::length_type  state_saved_;   // number of elements saved
   unsigned long  op_calls_;
-  vsip::Vector<T,typename impl::Fir_aligned<T>::block_type>  state_;
   vsip::Vector<T,typename impl::Fir_aligned<T>::block_type>  kernel_; 
-
+  vsip::Vector<T,typename impl::Fir_aligned<T>::block_type>  state_;
+  vsip::length_type  state_saved_;   // number of elements saved
+#if VSIP_IMPL_HAVE_IPP
+  vsip::Vector<T,typename impl::Fir_aligned<T>::block_type>  temp_in_;
+  vsip::Vector<T,typename impl::Fir_aligned<T>::block_type>  temp_out_;
+#endif
   impl::profile::Acc_timer timer_;
 };
 
Index: tests/fir.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/tests/fir.cpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -p -r1.2 -r1.3
--- tests/fir.cpp	10 Oct 2005 06:33:40 -0000	1.2
+++ tests/fir.cpp	13 Oct 2005 10:23:34 -0000	1.3
@@ -28,6 +28,39 @@
   Definitions
 ***********************************************************************/
 
+
+template <typename T, typename BlockT>
+double
+error_db(
+  vsip::const_Vector<T,BlockT> v1,
+  vsip::const_Vector<T,BlockT> v2)
+{
+  double refmax = 0.0;
+  double maxsum = -250;
+  double sum;
+
+  vsip::Index<1> idx;
+
+  refmax = vsip::maxval(vsip::magsq(v1), idx);
+
+  for (vsip::index_type i=0; i<v1.size(); ++i)
+  {
+    double val = vsip::magsq(v1.get(i) - v2.get(i));
+
+    if (val < 1.e-20)
+      sum = -201.;
+    else
+      sum = 10.0 * log10(val/(2.0*refmax));
+
+    if (sum > maxsum)
+      maxsum = sum;
+  }
+
+  return maxsum;
+}
+
+
+
 template <typename T, vsip::symmetry_type sym>
 void
 test_fir(
@@ -70,7 +103,7 @@ test_fir(
   assert(fir2.symmetry == sym);
   assert(fir1a.continuous_filter == vsip::state_save);
   assert(fir2.continuous_filter == vsip::state_no_save);
-
+ 
   const vsip::length_type  order = (sym == vsip::nonsym) ? M :
     (sym == vsip::sym_even_len_even) ? 2 * M : (2 * M) - 1;
   assert(fir1a.kernel_size() == order);
@@ -90,6 +123,8 @@ test_fir(
       output1(vsip::Domain<1>(got, 1, (N + D - 1) / D)));
   }
 
+  // vsip::Vector<T> o1(output1.size(), T(0));
+  // o1 = convout(vsip::Domain<1>(output1.size())) - output1;
   
   vsip::length_type got1b = 0;
   vsip::length_type got2 = 0;
@@ -104,11 +139,26 @@ test_fir(
 
   vsip::Vector<T>  reference(convout(vsip::Domain<1>(got)));
   vsip::Vector<T>  result(output1(vsip::Domain<1>(got)));
+
   assert(outsize - got <= 1);
-  assert(vsip::alltrue(result == reference));
+  if (got > 256)
+  {
+    double error = error_db(result, reference);
+    assert(error < -100);
+  }
+  else
+    assert(view_equal(result, reference));
+
   assert(got1b == got2);
-  assert(vsip::alltrue(
-    output2(vsip::Domain<1>(got1b)) == output3(vsip::Domain<1>(got1b))));
+  if (got > 256)
+  {
+    double error = error_db(output2(vsip::Domain<1>(got1b)),
+                            output3(vsip::Domain<1>(got1b)));
+    assert(error < -100);
+  }
+  else
+    assert(view_equal(output2(vsip::Domain<1>(got1b)),
+                      output3(vsip::Domain<1>(got1b))));
 }
   
 int
@@ -116,22 +166,18 @@ main()
 {
   vsip::vsipl init;
 
-
   test_fir<float,vsip::nonsym>(1,2,3);
-
-  test_fir<float,vsip::nonsym>(3,23,31);
-
   test_fir<float,vsip::nonsym>(1,3,5);
   test_fir<float,vsip::nonsym>(1,3,9);
   test_fir<float,vsip::nonsym>(1,4,8);
   test_fir<float,vsip::nonsym>(1,23,31);
-  test_fir<float,vsip::nonsym>(1,32,256);
+  test_fir<float,vsip::nonsym>(1,32,1024);
 
   test_fir<float,vsip::nonsym>(2,3,5);
   test_fir<float,vsip::nonsym>(2,3,9);
   test_fir<float,vsip::nonsym>(2,4,8);
   test_fir<float,vsip::nonsym>(2,23,31);
-  test_fir<float,vsip::nonsym>(2,32,256);
+  test_fir<float,vsip::nonsym>(2,32,1024);
 
   test_fir<double,vsip::nonsym>(2,3,5);
   test_fir<double,vsip::nonsym>(2,3,9);
@@ -143,7 +189,7 @@ main()
   test_fir<std::complex<float>,vsip::nonsym>(2,3,9);
   test_fir<std::complex<float>,vsip::nonsym>(2,4,8);
   test_fir<std::complex<float>,vsip::nonsym>(2,23,31);
-  test_fir<std::complex<float>,vsip::nonsym>(2,32,256);
+  test_fir<std::complex<float>,vsip::nonsym>(2,32,1024);
 
   test_fir<std::complex<double>,vsip::nonsym>(2,3,5);
   test_fir<std::complex<double>,vsip::nonsym>(2,3,9);
@@ -155,13 +201,13 @@ main()
   test_fir<float,vsip::nonsym>(3,4,21);
   test_fir<float,vsip::nonsym>(3,9,27);
   test_fir<float,vsip::nonsym>(3,23,31);
-  test_fir<float,vsip::nonsym>(3,32,256);
+  test_fir<float,vsip::nonsym>(3,32,1024);
 
   test_fir<float,vsip::nonsym>(4,5,13);
   test_fir<float,vsip::nonsym>(4,7,31);
   test_fir<float,vsip::nonsym>(4,8,32);
   test_fir<float,vsip::nonsym>(4,23,31);
-  test_fir<float,vsip::nonsym>(4,32,256);
+  test_fir<float,vsip::nonsym>(4,32,1024);
 
   test_fir<float,vsip::sym_even_len_even>(1,1,3);
   test_fir<float,vsip::sym_even_len_even>(1,2,3);
@@ -169,52 +215,38 @@ main()
   test_fir<float,vsip::sym_even_len_even>(1,3,9);
   test_fir<float,vsip::sym_even_len_even>(1,4,8);
   test_fir<float,vsip::sym_even_len_even>(1,23,57);
-#if 0 
-  // FIXME: this exposes a bug in vsip::Convolution.
-  test_fir<float,vsip::sym_even_len_even>(1,32,256);
-#endif
+  test_fir<float,vsip::sym_even_len_even>(1,32,1024);
 
   test_fir<float,vsip::sym_even_len_even>(2,2,3);
   test_fir<float,vsip::sym_even_len_even>(2,3,5);
   test_fir<float,vsip::sym_even_len_even>(2,3,9);
   test_fir<float,vsip::sym_even_len_even>(2,4,8);
   test_fir<float,vsip::sym_even_len_even>(2,23,57);
-#if 0
-  // FIXME: likewise
-  test_fir<float,vsip::sym_even_len_even>(2,32,256);
-#endif
+  test_fir<float,vsip::sym_even_len_even>(2,32,1024);
 
   test_fir<float,vsip::sym_even_len_even>(3,3,5);
   test_fir<float,vsip::sym_even_len_even>(3,4,8);
   test_fir<float,vsip::sym_even_len_even>(3,23,57);
-#if 0
-  test_fir<float,vsip::sym_even_len_even>(3,32,256);
-#endif
+  test_fir<float,vsip::sym_even_len_even>(3,32,1024);
 
   test_fir<float,vsip::sym_even_len_odd>(1,2,3);
   test_fir<float,vsip::sym_even_len_odd>(1,3,5);
   test_fir<float,vsip::sym_even_len_odd>(1,3,9);
   test_fir<float,vsip::sym_even_len_odd>(1,4,9);
   test_fir<float,vsip::sym_even_len_odd>(1,23,57);
-#if 0
-  test_fir<float,vsip::sym_even_len_odd>(1,32,256);
-#endif
+  test_fir<float,vsip::sym_even_len_odd>(1,32,1024);
 
   test_fir<float,vsip::sym_even_len_odd>(2,2,3);
   test_fir<float,vsip::sym_even_len_odd>(2,3,5);
   test_fir<float,vsip::sym_even_len_odd>(2,3,9);
   test_fir<float,vsip::sym_even_len_odd>(2,4,10);
   test_fir<float,vsip::sym_even_len_odd>(2,23,57);
-#if 0
-  test_fir<float,vsip::sym_even_len_odd>(2,32,256);
-#endif
+  test_fir<float,vsip::sym_even_len_odd>(2,32,1024);
 
   test_fir<float,vsip::sym_even_len_odd>(3,3,5);
   test_fir<float,vsip::sym_even_len_odd>(3,4,9);
   test_fir<float,vsip::sym_even_len_odd>(3,23,55);
-#if 0
-  test_fir<float,vsip::sym_even_len_odd>(3,32,256);
-#endif
+  test_fir<float,vsip::sym_even_len_odd>(3,32,1024);
 
   return 0;
 }