// Copyright 2003 Regents of the University of California
// SETI_BOINC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2, or (at your option) any later
// version.
// SETI_BOINC is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
// You should have received a copy of the GNU General Public License along
// with SETI_BOINC; see the file COPYING. If not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
// In addition, as a special exception, the Regents of the University of
// California give permission to link the code of this program with libraries
// that provide specific optimized fast Fourier transform (FFT) functions and
// distribute a linked executable. You must obey the GNU General Public
// License in all respects for all of the code used other than the FFT library
// itself. Any modification required to support these libraries must be
// distributed in source code form. If you modify this file, you may extend
// this exception to your version of the file, but you are not obligated to
// do so. If you do not wish to do so, delete this exception statement from
// your version.
// AMD optimizations by Evandro Menezes
#ifdef USE_AMD_OPT_CODE
#include <climits>
#include <xmmintrin.h>
static const int as [4] [4] = {{0, 0, INT_MIN, INT_MIN}, // {-, -, +, +}
{0, 0, 0, INT_MIN}, // {-, +, +, +}
{0, INT_MIN, 0, INT_MIN}, // {-, +, -, +}
{INT_MIN, 0, INT_MIN, 0}}; // {+, -, +, -}
static const __m128 mmpp = *(__m128 *) (as [0] + 0),
mppp = *(__m128 *) (as [1] + 0),
mpmp = *(__m128 *) (as [2] + 0),
pmpm = *(__m128 *) (as [3] + 0);
void cftfsub(int n, float *a, float *w)
{
void cft1st(int n, float *a, float *w);
void cftmdl(int n, int l, float *a, float *w);
int j, j1, j2, j3, l;
__m128 x10, x32, y10, y32, zz;
zz = _mm_setzero_ps ();
l = 2;
if (n >= 16) {
cft1st(n, a, w);
l = 16;
while ((l << 3) <= n) {
cftmdl(n, l, a, w);
l <<= 3;
}
}
if ((l << 1) < n) {
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
// x0r = a[j] + a[j1];
// x0i = a[j + 1] + a[j1 + 1];
x10 = _mm_loadl_pi (zz, (__m64 *) (a + j));
y10 = _mm_loadl_pi (zz, (__m64 *) (a + j1));
// x1r = a[j] - a[j1];
// x1i = a[j + 1] - a[j1 + 1];
x10 = _mm_movelh_ps (x10, x10);
y10 = _mm_movelh_ps (y10, y10);
y10 = _mm_xor_ps (mmpp, y10);
x10 = _mm_add_ps (x10, y10);
// x2r = a[j2] + a[j3];
// x2i = a[j2 + 1] + a[j3 + 1];
x32 = _mm_loadl_pi (zz, (__m64 *) (a + j2));
y32 = _mm_loadl_pi (zz, (__m64 *) (a + j3));
// x3r = a[j2] - a[j3];
// x3i = a[j2 + 1] - a[j3 + 1];
x32 = _mm_movelh_ps (x32, x32);
y32 = _mm_movelh_ps (y32, y32);
y32 = _mm_xor_ps (mmpp, y32);
x32 = _mm_add_ps (x32, y32);
// a[j] = x0r + x2r;
// a[j + 1] = x0i + x2i;
// a[j1] = x1r - x3i;
// a[j1 + 1] = x1i + x3r;
x32 = _mm_xor_ps (mppp, x32);
x32 = _mm_shuffle_ps (x32, x32, _MM_SHUFFLE (2, 3, 1, 0));
y10 = _mm_add_ps (x10, x32);
_mm_storel_pi ((__m64 *) (a + j), y10);
_mm_storeh_pi ((__m64 *) (a + j1), y10);
// a[j2] = x0r - x2r;
// a[j2 + 1] = x0i - x2i;
// a[j3] = x1r + x3i;
// a[j3 + 1] = x1i - x3r;
y32 = _mm_sub_ps (x10, x32);
_mm_storel_pi ((__m64 *) (a + j2), y32);
_mm_storeh_pi ((__m64 *) (a + j3), y32);
}
} else if ((l << 1) == n) {
for (j = 0; j < l; j += 2) {
j1 = j + l;
// x0r = a[j] - a[j1];
// x0i = a[j + 1] - a[j1 + 1];
x10 = _mm_loadl_pi (zz, (__m64 *) (a + j));
x32 = _mm_loadl_pi (zz, (__m64 *) (a + j1));
y10 = _mm_sub_ps (x10, x32);
// a[j] += a[j1];
// a[j + 1] += a[j1 + 1];
y32 = _mm_add_ps (x10, x32);
_mm_storel_pi ((__m64 *) (a + j), y32);
// a[j1] = x0r;
// a[j1 + 1] = x0i;
_mm_storel_pi ((__m64 *) (a + j1), y10);
}
}
}
#endif // USE_AMD_OPT_CODE
syntax highlighted by Code2HTML, v. 0.9.1