blob: 1b525f2740628f90b646983eb42e95c91c4b64e6 [file] [log] [blame]
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <float.h>
#include <stdlib.h>
#include <altivec.h>
#define N 1048576
#define N2 N/2
void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign);
void cffti(int n, float w[][2]);
main()
{
/*
Example of Apple Altivec coded binary radix FFT
using intrinsics from Petersen and Arbenz "Intro.
to Parallel Computing," Section 3.6
This is an expanded version of a generic work-space
FFT: steps are in-line. cfft2(n,x,y,w,sign) takes complex
n-array "x" (Fortran real,aimag,real,aimag,... order)
and writes its DFT in "y". Both input "x" and the
original contents of "y" are destroyed. Initialization
for array "w" (size n/2 complex of twiddle factors
(exp(twopi*i*k/n), for k=0..n/2-1)) is computed once
by cffti(n,w).
WPP, SAM. Math. ETHZ, 1 June, 2002
*/
int first,i,icase,it,ln2,n;
int nits=1000000;
static float seed = 331.0;
float error,fnm1,sign,z0,z1,ggl();
float *x,*y,*z,*w;
double t1,mflops;
/* allocate storage for x,y,z,w on 4-word bndr. */
x = (float *) malloc(8*N);
y = (float *) malloc(8*N);
z = (float *) malloc(8*N);
w = (float *) malloc(4*N);
n = 2;
for(ln2=1;ln2<21;ln2++){
first = 1;
for(icase=0;icase<2;icase++){
if(first){
for(i=0;i<2*n;i+=2){
z0 = ggl(&seed); /* real part of array */
z1 = ggl(&seed); /* imaginary part of array */
x[i] = z0;
z[i] = z0; /* copy of initial real data */
x[i+1] = z1;
z[i+1] = z1; /* copy of initial imag. data */
}
} else {
for(i=0;i<2*n;i+=2){
z0 = 0; /* real part of array */
z1 = 0; /* imaginary part of array */
x[i] = z0;
z[i] = z0; /* copy of initial real data */
x[i+1] = z1;
z[i+1] = z1; /* copy of initial imag. data */
}
}
/* initialize sine/cosine tables */
cffti(n,w);
/* transform forward, back */
if(first){
sign = 1.0;
cfft2(n,x,y,w,sign);
sign = -1.0;
cfft2(n,y,x,w,sign);
/* results should be same as initial multiplied by n */
fnm1 = 1.0/((float) n);
error = 0.0;
for(i=0;i<2*n;i+=2){
error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
(z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
}
error = sqrt(fnm1*error);
printf(" for n=%d, fwd/bck error=%e\n",n,error);
first = 0;
} else {
for(it=0;it<nits;it++){
sign = +1.0;
cfft2(n,x,y,w,sign);
sign = -1.0;
cfft2(n,y,x,w,sign);
}
}
}
if((ln2%4)==0) nits /= 10;
n *= 2;
}
return 0;
}
void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign)
{
/*
altivec version of cfft2 from Petersen and Arbenz book, "Intro.
to Parallel Computing", Oxford Univ. Press, 2003, Section 3.6
wpp 14. Dec. 2003
*/
int jb,jc,jd,jw,k,k2,k4,lj,m,j,mj,mj2,pass,tgle;
float rp,up,wr[4] __attribute((aligned(16)));
float wu[4] __attribute((aligned(16)));
float *a,*b,*c,*d;
const vector float vminus = (vector float){-0.,0.,-0.,0.};
const vector float vzero = (vector float){0.,0.,0.,0.};
const vector unsigned char pv3201 =
(vector unsigned char){4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
vector float V0,V1,V2,V3,V4,V5,V6,V7;
vector float V8,V9,V10,V11,V12,V13,V14,V15;
if(n<=1){
y[0][0] = x[0][0];
y[0][1] = x[0][1];
return;
}
m = (int) (log((float) n)/log(1.99));
mj = 1;
mj2 = 2;
lj = n/2;
/* first pass thru data: x -> y */
for(j=0;j<lj;j++){
jb = n/2+j; jc = j*mj2; jd = jc + 1;
rp = w[j][0]; up = w[j][1];
if(sign<0.0) up = -up;
y[jd][0] = rp*(x[j][0] - x[jb][0]) - up*(x[j][1] - x[jb][1]);
y[jd][1] = up*(x[j][0] - x[jb][0]) + rp*(x[j][1] - x[jb][1]);
y[jc][0] = x[j][0] + x[jb][0];
y[jc][1] = x[j][1] + x[jb][1];
}
if(n==2) return;
/* next pass is mj = 2 */
mj = 2;
mj2 = 4;
lj = n/4;
a = (float *)&y[0][0];
b = (float *)&y[n/2][0];
c = (float *)&x[0][0];
d = (float *)&x[mj][0];
if(n==4){
c = (float *)&y[0][0];
d = (float *)&y[mj][0];
}
for(j=0;j<lj;j++){
jw = j*mj; jc = j*mj2; jd = 2*jc;
rp = w[jw][0]; up = w[jw][1];
if(sign<0.0) up = -up;
wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up;
V6 = vec_ld(0,wr);
V7 = vec_ld(0,wu);
V7 = vec_xor(V7,vminus);
V0 = vec_ld(0,(vector float *) (a+jc));
V1 = vec_ld(0,(vector float *) (b+jc));
V2 = vec_add(V0,V1); /* a + b */
vec_st(V2,0,(vector float *) (c+jd)); /* store c */
V3 = vec_sub(V0,V1); /* a - b */
V4 = vec_perm(V3,V3,pv3201);
V0 = vec_madd(V6,V3,vzero);
V1 = vec_madd(V7,V4,vzero);
V2 = vec_add(V0,V1); /* w*(a - b) */
vec_st(V2,0,(vector float*) (d+jd)); /* store d */
}
if(n==4) return;
mj *= 2;
mj2 = 2*mj;
lj = n/mj2;
tgle = 0;
for(pass=2;pass<m-1;pass++){
if(tgle){
a = (float *)&y[0][0];
b = (float *)&y[n/2][0];
c = (float *)&x[0][0];
d = (float *)&x[mj][0];
tgle = 0;
} else {
a = (float *)&x[0][0];
b = (float *)&x[n/2][0];
c = (float *)&y[0][0];
d = (float *)&y[mj][0];
tgle = 1;
}
for(j=0; j<lj; j++){
jw = j*mj; jc = j*mj2; jd = 2*jc;
rp = w[jw][0];
up = w[jw][1];
if(sign<0.0) up = -up;
wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up;
V6 = vec_ld(0,wr);
V7 = vec_ld(0,wu);
V7 = vec_xor(V7,vminus);
for(k=0; k<mj; k+=4){
k2 = 2*k; k4 = k2+4;
V0 = vec_ld(0,(vector float *) (a+jc+k2));
V1 = vec_ld(0,(vector float *) (b+jc+k2));
V2 = vec_add(V0,V1); /* a + b */
vec_st(V2,0,(vector float*) (c+jd+k2)); /* store c */
V3 = vec_sub(V0,V1); /* a - b */
V4 = vec_perm(V3,V3,pv3201);
V0 = vec_madd(V6,V3,vzero);
V1 = vec_madd(V7,V4,vzero);
V2 = vec_add(V0,V1); /* w*(a - b) */
vec_st(V2,0,(vector float *) (d+jd+k2)); /* store d */
V8 = vec_ld(0,(vector float *) (a+jc+k4));
V9 = vec_ld(0,(vector float *) (b+jc+k4));
V10 = vec_add(V8,V9); /* a + b */
vec_st(V10,0,(vector float *) (c+jd+k4)); /* store c */
V11 = vec_sub(V8,V9); /* a - b */
V12 = vec_perm(V11,V11,pv3201);
V8 = vec_madd(V6,V11,vzero);
V9 = vec_madd(V7,V12,vzero);
V10 = vec_add(V8,V9); /* w*(a - b) */
vec_st(V10,0,(vector float *) (d+jd+k4)); /* store d */
}
}
mj *= 2;
mj2 = 2*mj;
lj = n/mj2;
}
/* last pass thru data: in-place if previous in y */
c = (float *)&y[0][0];
d = (float *)&y[n/2][0];
if(tgle) {
a = (float *)&y[0][0];
b = (float *)&y[n/2][0];
} else {
a = (float *)&x[0][0];
b = (float *)&x[n/2][0];
}
for(k=0; k<(n/2); k+=4){
k2 = 2*k; k4 = k2+4;
V0 = vec_ld(0,(vector float *) (a+k2));
V1 = vec_ld(0,(vector float *) (b+k2));
V2 = vec_add(V0,V1); /* a + b */
vec_st(V2,0,(vector float*) (c+k2)); /* store c */
V3 = vec_sub(V0,V1); /* a - b */
vec_st(V3,0,(vector float *) (d+k2)); /* store d */
V4 = vec_ld(0,(vector float *) (a+k4));
V5 = vec_ld(0,(vector float *) (b+k4));
V6 = vec_add(V4,V5); /* a + b */
vec_st(V6,0,(vector float *) (c+k4)); /* store c */
V7 = vec_sub(V4,V5); /* a - b */
vec_st(V7,0,(vector float *) (d+k4)); /* store d */
}
}
// LLVM LOCAL begin
// Implementations of sin() and cos() may vary slightly in the accuracy of
// their results, typically only in the least significant bit. Round to make
// the results consistent across platforms.
typedef union { double d; unsigned long long ll; } dbl_ll_union;
static double LLVMsin(double d) {
dbl_ll_union u;
u.d = sin(d);
u.ll = (u.ll + 1) & ~1ULL;
return u.d;
}
static double LLVMcos(double d) {
dbl_ll_union u;
u.d = cos(d);
u.ll = (u.ll + 1) & ~1ULL;
return u.d;
}
// LLVM LOCAL end
void cffti(int n, float w[][2])
{
/* initialization routine for cfft2: computes
cos(twopi*k),sin(twopi*k) for k=0..n/2-1
- the "twiddle factors" for a binary radix FFT */
int i,n2;
float aw,arg,pi;
pi = 3.141592653589793;
n2 = n/2;
aw = 2.0*pi/((float)n);
for(i=0;i<n2;i++){
arg = aw*((float)i);
w[i][0] = LLVMcos(arg);
w[i][1] = LLVMsin(arg);
}
}
#include <math.h>
float ggl(float *ds)
{
/* generate u(0,1) distributed random numbers.
Seed ds must be saved between calls. ggl is
essentially the same as the IMSL routine RNUM.
W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
a modification of a fortran version from
I. Vattulainen, Tampere Univ. of Technology,
Finland, 1992 */
double t,d2=0.2147483647e10;
t = (float) *ds;
t = fmod(0.16807e5*t,d2);
*ds = (float) t;
return((float) ((t-1.0e0)/(d2-1.0e0)));
}