/*
* Copyright (c) 2003, 2006 Matteo Frigo
* Copyright (c) 2003, 2006 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
/* This header only provides the double-precision SSE2 backend: refuse to
   build if single or long-double precision has been selected. */
#if defined(FFTW_SINGLE) || defined(FFTW_LDOUBLE)
#error "SSE2 only works in double precision"
#endif
#define VL 1 /* SIMD vector length, in terms of complex numbers */
/* Alignment constants; 16 matches the byte size of the __m128d vector type
   used below.  How the two differ in use is decided by code outside this
   header. */
#define ALIGNMENT 16
#define ALIGNMENTA 16
/* Stride check for aligned vector accesses: evaluates to 1 for every x. */
#define SIMD_VSTRIDE_OKA(x) 1
/* The "pair" stride check is simply an alias of SIMD_STRIDE_OK, which is
   expected to be defined elsewhere. */
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
/* RIGHT_CPU names the function X(have_sse2), declared here as returning int
   and taking no arguments.  NOTE(review): presumably a run-time check that
   the CPU supports SSE2 -- the definition is not in this file. */
#define RIGHT_CPU X(have_sse2)
extern int RIGHT_CPU(void);
/* gcc compiles the following code only when __SSE2__ is defined;
   any non-GNU compiler always compiles it. */
#if defined(__SSE2__) || !defined(__GNUC__)
/* some versions of glibc's sys/cdefs.h define __inline to be empty,
   which is wrong because emmintrin.h defines several inline
   procedures */
#undef __inline
#include <emmintrin.h>
/* V is the SIMD vector type of this backend: one SSE2 register holding two
   doubles. */
typedef __m128d V;
/* Short aliases for the SSE2 packed-double intrinsics used by the rest of
   this header: lane-wise arithmetic and bitwise xor ... */
#define VADD _mm_add_pd
#define VSUB _mm_sub_pd
#define VMUL _mm_mul_pd
#define VXOR _mm_xor_pd
/* ... lane shuffles and interleaves ... */
#define SHUFPD _mm_shuffle_pd
#define UNPCKL _mm_unpacklo_pd
#define UNPCKH _mm_unpackhi_pd
/* ... and stores of a single (high or low) lane to memory. */
#define STOREH _mm_storeh_pd
#define STOREL _mm_storel_pd
/*
 * DVK(var, val) declares a constant named var; LDK(x) turns such a constant
 * into a vector at the point of use.
 *
 * GNU C: DVK uses a statement expression to initialise a V from a
 * function-local static const union dvec holding val in both lanes, so var
 * already is a vector and LDK is the identity.
 *
 * Other compilers: DVK declares a scalar const R initialised through K(val)
 * (K is defined elsewhere), and LDK broadcasts the scalar to both lanes
 * with _mm_set1_pd.
 */
#ifdef __GNUC__
#define DVK(var, val) V var = __extension__ ({ \
static const union dvec _var = { {val, val} }; \
_var.v; \
})
#define LDK(x) x
#else
#define DVK(var, val) const R var = K(val)
#define LDK(x) _mm_set1_pd(x)
#endif
/* Overlay of two doubles and one vector.  The GNU C version of DVK
   initialises the d member (the first member, hence the brace initialiser)
   and then reads the same storage back through v. */
union dvec {
double d[2];
V v;
};
/* Overlay of four unsigned integers and one vector; this is the type of the
   externally defined X(sse2_mp) mask, whose v member is read by CHS_R.
   NOTE(review): the overlay only covers the whole vector if unsigned is
   32 bits wide -- confirm for the supported targets. */
union uvec {
unsigned u[4];
V v;
};
/*
 * Load one vector from x.
 *
 * The memory at x is read directly through a pointer to V, so x must be
 * suitably aligned for V.  ivs and aligned_like are accepted only to keep
 * the signature uniform; neither is consulted.
 */
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     const V *src = (const V *)x;

     (void)ivs;          /* UNUSED */
     (void)aligned_like; /* UNUSED */
     return *src;
}
/*
 * Store the vector v at x.
 *
 * The write goes directly through a pointer to V, so x must be suitably
 * aligned for V.  ovs and aligned_like are accepted only to keep the
 * signature uniform; neither is consulted.
 */
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     V *dst = (V *)x;

     (void)ovs;          /* UNUSED */
     (void)aligned_like; /* UNUSED */
     *dst = v;
}
/* In this backend the generic load/store names and the STM2 store are all
   plain aliases of the aligned LDA/STA helpers. */
#define LD LDA
#define ST STA
#define STM2 STA
/* STN2 expands to nothing: its arguments are discarded and no store is
   emitted. */
#define STN2(x, v0, v1, ovs) /* nop */
/*
 * Scatter the two lanes of v: the low lane is written to x[0] and the high
 * lane to x[ovs].  aligned_like is not consulted.
 */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     R *low_dst = x;
     R *high_dst = x + ovs;

     (void)aligned_like; /* UNUSED */
     STOREL(low_dst, v);
     STOREH(high_dst, v);
}
/* STN4 expands to nothing: its arguments are discarded and no store is
   emitted.  Per the note on the disabled code below, STM4 is used
   instead. */
#define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#if 0 /* not used, using STM4 */
/* Disabled alternative implementation, kept for reference.  It would store
   the interleaved low lanes of (v0, v1) at x and of (v2, v3) at x + 2, and
   the interleaved high lanes at the same offsets plus ovs.
   NOTE(review): the "UNUSED" casts below contradict the stores that follow,
   which use every parameter. */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
(void)x; /* UNUSED */
(void)v0; /* UNUSED */
(void)v1; /* UNUSED */
(void)v2; /* UNUSED */
(void)v3; /* UNUSED */
(void)ovs; /* UNUSED */
STA(x, UNPCKL(v0, v1), 0, 0);
STA(x + ovs, UNPCKH(v0, v1), 0, 0);
STA(x + 2, UNPCKL(v2, v3), 0, 0);
STA(x + 2 + ovs, UNPCKH(v2, v3), 0, 0);
}
#endif
/*
 * Return x with its two lanes exchanged.  With one complex number per
 * vector (VL == 1) this swaps the real and imaginary parts.
 */
static inline V FLIP_RI(V x)
{
     /* selector 1: low result lane <- high lane of x,
        high result lane <- low lane of x */
     V swapped = SHUFPD(x, x, 1);

     return swapped;
}
/* Bit mask used by CHS_R; defined in another translation unit. */
extern const union uvec X(sse2_mp);

/*
 * Return x XORed with the sse2_mp mask.
 * NOTE(review): presumably the mask holds only the sign bit of the low
 * (real) lane, so that this negates the real part and leaves the imaginary
 * part alone -- confirm against the definition of sse2_mp.
 */
static inline V CHS_R(V x)
{
     V mask = X(sse2_mp).v;

     return VXOR(mask, x);
}
/*
 * Swap the two lanes of x, then apply CHS_R to the result.  If CHS_R
 * negates the low lane, (re, im) becomes (-im, re), i.e. x multiplied by
 * the imaginary unit.
 */
static inline V VBYI(V x)
{
     return CHS_R(FLIP_RI(x));
}
/* VFMAI(b, c) yields c + VBYI(b) and VFNMSI(b, c) yields c - VBYI(b);
   with VBYI acting as multiplication by i these are c + i*b and c - i*b. */
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
/*
 * Complex product tx * sr, each vector holding one complex number as
 * (re, im).
 *
 * Both components of tx are broadcast across a vector; the result is
 * re(tx) * sr + im(tx) * VBYI(sr), where VBYI(sr) stands for i * sr.
 */
static inline V VZMUL(V tx, V sr)
{
     V t_re = UNPCKL(tx, tx);            /* (re(tx), re(tx)) */
     V t_im = UNPCKH(tx, tx);            /* (im(tx), im(tx)) */
     V direct = VMUL(sr, t_re);
     V rotated = VBYI(sr);

     return VADD(direct, VMUL(t_im, rotated));
}
/*
 * Complex product conj(tx) * sr, each vector holding one complex number as
 * (re, im).
 *
 * Same scheme as VZMUL, except that the im(tx) * VBYI(sr) term is
 * subtracted instead of added, which conjugates tx.
 */
static inline V VZMULJ(V tx, V sr)
{
     V t_re = UNPCKL(tx, tx);            /* (re(tx), re(tx)) */
     V t_im = UNPCKH(tx, tx);            /* (im(tx), im(tx)) */
     V direct = VMUL(sr, t_re);
     V rotated = VBYI(sr);

     return VSUB(direct, VMUL(t_im, rotated));
}
/* twiddle storage #1: compact, slower */
/* One TW_CEXP descriptor per twiddle factor; BYTW1/BYTWJ1 load it as a
   single vector.  NOTE(review): the descriptor fields are interpreted by
   the twiddle generator, which is not part of this file. */
#define VTW1(x) {TW_CEXP, 0, x}
#define TWVL1 1
/* Storage scheme #3 simply reuses scheme #1 in this backend. */
#define VTW3(x) VTW1(x)
#define TWVL3 TWVL1
/*
 * Multiply sr by the twiddle factor stored at t (storage scheme #1): the
 * factor is loaded as one vector and combined with sr through VZMUL.
 */
static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LD(t, 1, t), sr);
}
/*
 * Multiply sr by the conjugate of the twiddle factor stored at t (storage
 * scheme #1): the factor is loaded as one vector and combined with sr
 * through VZMULJ.
 */
static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LD(t, 1, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
/* Four descriptors per twiddle factor: two TW_COS entries for x followed by
   TW_SIN entries for -x and x.  BYTW2/BYTWJ2 read them back as two
   consecutive vectors.  NOTE(review): the descriptor fields are interpreted
   by the twiddle generator, which is not part of this file. */
#define VTW2(x) \
{TW_COS, 0, x}, {TW_COS, 0, x}, {TW_SIN, 0, -x}, {TW_SIN, 0, x}
#define TWVL2 2
/*
 * Multiply sr by a twiddle factor held in storage scheme #2.
 *
 * t is read as two consecutive vectors.  The first scales sr itself, the
 * second scales sr with its lanes swapped, and the two products are added.
 * With the VTW2 layout (cos, cos) / (sin(-x), sin(x)) this amounts to a
 * complex multiplication without any sign-flip instruction.
 */
static inline V BYTW2(const R *t, V sr)
{
     const V *tw = (const V *)t;
     V sr_swapped = FLIP_RI(sr);
     V tw_first = tw[0];
     V tw_second = tw[1];
     V direct = VMUL(tw_first, sr);
     V cross = VMUL(tw_second, sr_swapped);

     return VADD(direct, cross);
}
/*
 * Multiply sr by the conjugate of a twiddle factor held in storage
 * scheme #2.
 *
 * Identical to BYTW2 except that the product involving the lane-swapped sr
 * is subtracted rather than added.
 */
static inline V BYTWJ2(const R *t, V sr)
{
     const V *tw = (const V *)t;
     V sr_swapped = FLIP_RI(sr);
     V tw_first = tw[0];
     V tw_second = tw[1];
     V direct = VMUL(tw_first, sr);
     V cross = VMUL(tw_second, sr_swapped);

     return VSUB(direct, cross);
}
/* Multiply-add family, built from a separate multiply followed by an add or
   subtract (no fused instruction is used here):
     VFMA(a, b, c)  = c + a * b
     VFNMS(a, b, c) = c - a * b
     VFMS(a, b, c)  = a * b - c  */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
/* twiddle storage for split arrays */
/* Four descriptors per twiddle factor: TW_COS with second field 0 and 1,
   then TW_SIN with second field 0 and 1.  NOTE(review): the meaning of the
   second field is defined by the twiddle generator, which is not part of
   this file. */
#define VTWS(x) \
{TW_COS, 0, x}, {TW_COS, 1, x}, {TW_SIN, 0, x}, {TW_SIN, 1, x}
#define TWVLS (2 * VL)
#endif /* __SSE2__ */
/* syntax highlighted by Code2HTML, v. 0.9.1 */