http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
General Purpose FFT (Fast Fourier/Cosine/Sine Transform) Package
1-dim DFT / DCT / DST

*This is a package to calculate Discrete Fourier/Cosine/Sine Transforms of
1-dimensional sequences of length 2^N. This package contains C and Fortran FFT codes.

*2-dim/3-dim DFT / DCT / DST
2,3-dimensional sequences of length 2^N.

*Benchmark
http://www.kurims.kyoto-u.ac.jp/~ooura/fftbmk.html
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b78d6bd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,30 @@
+*.o
+*.Po
+*.a
+*.la
+*.lai
+*.lo
+*.slo
+*.Plo
+*.so
+*.dylib
+*~
+*.log
+*.swp
+*.cache/*
+
+*Makefile
+
+config.h
+config.status
+compile
+libtool
+ffts.pc
+stamp-h1
+tests/test
+
+java/android/local.properties
+java/android/gen/*
+java/android/obj/*
+java/android/bin/*
+
diff --git a/alloc.c b/alloc.c
new file mode 100644
index 0000000..7833d88
--- /dev/null
+++ b/alloc.c
@@ -0,0 +1,153 @@
+/* ---- memory allocation ---- */
+#include "alloc.h"
+
+
+#define alloc_error_check(p) { \
+    if ((p) == NULL) { \
+        fprintf(stderr, "Allocation Failure!\n"); \
+        exit(1); \
+    } \
+}
+
+
+int *alloc_1d_int(int n1)
+{
+    int *i;
+    
+    i = (int *) malloc(sizeof(int) * n1);
+    alloc_error_check(i);
+    return i;
+}
+
+
+void free_1d_int(int *i)
+{
+    free(i);
+}
+
+
+double *alloc_1d_double(int n1)
+{
+    double *d;
+    
+    d = (double *) malloc(sizeof(double) * n1);
+    alloc_error_check(d);
+    return d;
+}
+
+
+void free_1d_double(double *d)
+{
+    free(d);
+}
+
+
+int **alloc_2d_int(int n1, int n2)
+{
+    int **ii, *i;
+    int j;
+    
+    ii = (int **) malloc(sizeof(int *) * n1);
+    alloc_error_check(ii);
+    i = (int *) malloc(sizeof(int) * n1 * n2);
+    alloc_error_check(i);
+    ii[0] = i;
+    for (j = 1; j < n1; j++) {
+        ii[j] = ii[j - 1] + n2;
+    }
+    return ii;
+}
+
+
+void free_2d_int(int **ii)
+{
+    free(ii[0]);
+    free(ii);
+}
+
+
+double **alloc_2d_double(int n1, int n2)
+{
+    double **dd, *d;
+    int j;
+    
+    dd = (double **) malloc(sizeof(double *) * n1);
+    alloc_error_check(dd);
+    d = (double *) malloc(sizeof(double) * n1 * n2);
+    alloc_error_check(d);
+    dd[0] = d;
+    for (j = 1; j < n1; j++) {
+        dd[j] = dd[j - 1] + n2;
+    }
+    return dd;
+}
+
+
+void free_2d_double(double **dd)
+{
+    free(dd[0]);
+    free(dd);
+}
+
+
+int ***alloc_3d_int(int n1, int n2, int n3)
+{
+    int ***iii, **ii, *i;
+    int j;
+    
+    iii = (int ***) malloc(sizeof(int **) * n1);
+    alloc_error_check(iii);
+    ii = (int **) malloc(sizeof(int *) * n1 * n2);
+    alloc_error_check(ii);
+    iii[0] = ii;
+    for (j = 1; j < n1; j++) {
+        iii[j] = iii[j - 1] + n2;
+    }
+    i = (int *) malloc(sizeof(int) * n1 * n2 * n3);
+    alloc_error_check(i);
+    ii[0] = i;
+    for (j = 1; j < n1 * n2; j++) {
+        ii[j] = ii[j - 1] + n3;
+    }
+    return iii;
+}
+
+
+void free_3d_int(int ***iii)
+{
+    free(iii[0][0]);
+    free(iii[0]);
+    free(iii);
+}
+
+
+double ***alloc_3d_double(int n1, int n2, int n3)
+{
+    double ***ddd, **dd, *d;
+    int j;
+    
+    ddd = (double ***) malloc(sizeof(double **) * n1);
+    alloc_error_check(ddd);
+    dd = (double **) malloc(sizeof(double *) * n1 * n2);
+    alloc_error_check(dd);
+    ddd[0] = dd;
+    for (j = 1; j < n1; j++) {
+        ddd[j] = ddd[j - 1] + n2;
+    }
+    d = (double *) malloc(sizeof(double) * n1 * n2 * n3);
+    alloc_error_check(d);
+    dd[0] = d;
+    for (j = 1; j < n1 * n2; j++) {
+        dd[j] = dd[j - 1] + n3;
+    }
+    return ddd;
+}
+
+
+void free_3d_double(double ***ddd)
+{
+    free(ddd[0][0]);
+    free(ddd[0]);
+    free(ddd);
+}
+
diff --git a/alloc.h b/alloc.h
new file mode 100644
index 0000000..3467cc4
--- /dev/null
+++ b/alloc.h
@@ -0,0 +1,20 @@
+/* ---- memory allocation ---- */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+int *alloc_1d_int(int n1);
+void free_1d_int(int *i);
+double *alloc_1d_double(int n1);
+void free_1d_double(double *d);
+int **alloc_2d_int(int n1, int n2);
+void free_2d_int(int **ii);
+double **alloc_2d_double(int n1, int n2);
+void free_2d_double(double **dd);
+int ***alloc_3d_int(int n1, int n2, int n3);
+void free_3d_int(int ***iii);
+double ***alloc_3d_double(int n1, int n2, int n3);
+void free_3d_double(double ***ddd);
+
diff --git a/fft4f2d.c b/fft4f2d.c
new file mode 100644
index 0000000..354c80e
--- /dev/null
+++ b/fft4f2d.c
@@ -0,0 +1,1705 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :two
+    data length :power of 2
+    decimation  :frequency
+    radix       :4, 2, row-column
+    data        :inplace
+    table       :use
+functions
+    cdft2d: Complex Discrete Fourier Transform
+    rdft2d: Real Discrete Fourier Transform
+    ddct2d: Discrete Cosine Transform
+    ddst2d: Discrete Sine Transform
+function prototypes
+    void cdft2d(int, int, int, double **, int *, double *);
+    void rdft2d(int, int, int, double **, int *, double *);
+    void ddct2d(int, int, int, double **, double **, int *, double *);
+    void ddst2d(int, int, int, double **, double **, int *, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
+                            exp(2*pi*i*j1*k1/n1) * 
+                            exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+        <case2>
+            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
+                            exp(-2*pi*i*j1*k1/n1) * 
+                            exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft2d(n1, 2*n2, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft2d(n1, 2*n2, -1, a, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 1, n1 = power of 2
+        2*n2   :data length (int)
+                n2 >= 1, n2 = power of 2
+        a[0...n1-1][0...2*n2-1]
+               :input/output data (double **)
+                input data
+                    a[j1][2*j2] = Re(x[j1][j2]), 
+                    a[j1][2*j2+1] = Im(x[j1][j2]), 
+                    0<=j1<n1, 0<=j2<n2
+                output data
+                    a[k1][2*k2] = Re(X[k1][k2]), 
+                    a[k1][2*k2+1] = Im(X[k1][k2]), 
+                    0<=k1<n1, 0<=k2<n2
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            cdft2d(n1, 2*n2, -1, a, ip, w);
+        is 
+            cdft2d(n1, 2*n2, 1, a, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
+                    a[j1][j2] *= 1.0 / (n1 * n2);
+                }
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+                            0<=k1<n1, 0<=k2<n2
+            I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+                            0<=k1<n1, 0<=k2<n2
+        <case2> IRDFT (excluding scale)
+            a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+                            (R[j1][j2] * 
+                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
+                            I[j1][j2] * 
+                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
+                            0<=k1<n1, 0<=k2<n2
+        (notes: R[n1-k1][n2-k2] = R[k1][k2], 
+                I[n1-k1][n2-k2] = -I[k1][k2], 
+                R[n1-k1][0] = R[k1][0], 
+                I[n1-k1][0] = -I[k1][0], 
+                R[0][n2-k2] = R[0][k2], 
+                I[0][n2-k2] = -I[0][k2], 
+                0<k1<n1, 0<k2<n2)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft2d(n1, n2, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft2d(n1, n2, -1, a, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (double **)
+                <case1>
+                    output data
+                        a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2], 
+                        a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2], 
+                            0<k1<n1, 0<k2<n2/2, 
+                        a[0][2*k2] = R[0][k2] = R[0][n2-k2], 
+                        a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2], 
+                            0<k2<n2/2, 
+                        a[k1][0] = R[k1][0] = R[n1-k1][0], 
+                        a[k1][1] = I[k1][0] = -I[n1-k1][0], 
+                        a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2], 
+                        a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2], 
+                            0<k1<n1/2, 
+                        a[0][0] = R[0][0], 
+                        a[0][1] = R[0][n2/2], 
+                        a[n1/2][0] = R[n1/2][0], 
+                        a[n1/2][1] = R[n1/2][n2/2]
+                <case2>
+                    input data
+                        a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2], 
+                        a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2], 
+                            0<j1<n1, 0<j2<n2/2, 
+                        a[0][2*j2] = R[0][j2] = R[0][n2-j2], 
+                        a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2], 
+                            0<j2<n2/2, 
+                        a[j1][0] = R[j1][0] = R[n1-j1][0], 
+                        a[j1][1] = I[j1][0] = -I[n1-j1][0], 
+                        a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2], 
+                        a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2], 
+                            0<j1<n1/2, 
+                        a[0][0] = R[0][0], 
+                        a[0][1] = R[0][n2/2], 
+                        a[n1/2][0] = R[n1/2][0], 
+                        a[n1/2][1] = R[n1/2][n2/2]
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/4) + n2/4
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            rdft2d(n1, n2, 1, a, ip, w);
+        is 
+            rdft2d(n1, n2, -1, a, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 2.0 / (n1 * n2);
+                }
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            cos(pi*j1*(k1+1/2)/n1) * 
+                            cos(pi*j2*(k2+1/2)/n2), 
+                            0<=k1<n1, 0<=k2<n2
+        <case2> DCT
+            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            cos(pi*(j1+1/2)*k1/n1) * 
+                            cos(pi*(j2+1/2)*k2/n2), 
+                            0<=k1<n1, 0<=k2<n2
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (double **)
+                output data
+                    a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
+        t[0...n1-1][0...n2-1]
+               :work area (double **)
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/4) + max(n1, n2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddct2d(n1, n2, -1, a, t, ip, w);
+        is 
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                a[j1][0] *= 0.5;
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                a[0][j2] *= 0.5;
+            }
+            ddct2d(n1, n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 4.0 / (n1 * n2);
+                }
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] * 
+                            sin(pi*j1*(k1+1/2)/n1) * 
+                            sin(pi*j2*(k2+1/2)/n2), 
+                            0<=k1<n1, 0<=k2<n2
+        <case2> DST
+            S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            sin(pi*(j1+1/2)*k1/n1) * 
+                            sin(pi*(j2+1/2)*k2/n2), 
+                            0<k1<=n1, 0<k2<=n2
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (double **)
+                <case1>
+                    input data
+                        a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2, 
+                        a[j1][0] = A[j1][n2], 0<j1<n1, 
+                        a[0][j2] = A[n1][j2], 0<j2<n2, 
+                        a[0][0] = A[n1][n2]
+                        (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
+                    output data
+                        a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
+                <case2>
+                    output data
+                        a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2, 
+                        a[k1][0] = S[k1][n2], 0<k1<n1, 
+                        a[0][k2] = S[n1][k2], 0<k2<n2, 
+                        a[0][0] = S[n1][n2]
+                        (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
+        t[0...n1-1][0...n2-1]
+               :work area (double **)
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/4) + max(n1, n2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddst2d(n1, n2, -1, a, t, ip, w);
+        is 
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                a[j1][0] *= 0.5;
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                a[0][j2] *= 0.5;
+            }
+            ddst2d(n1, n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 4.0 / (n1 * n2);
+                }
+            }
+        .
+*/
+
+
+void cdft2d(int n1, int n2, int isgn, double **a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void bitrv2col(int n1, int n, int *ip, double **a);
+    void bitrv2row(int n, int n2, int *ip, double **a);
+    void cftbcol(int n1, int n, double **a, double *w);
+    void cftbrow(int n, int n2, double **a, double *w);
+    void cftfcol(int n1, int n, double **a, double *w);
+    void cftfrow(int n, int n2, double **a, double *w);
+    int n;
+    
+    n = n1 << 1;
+    if (n < n2) {
+        n = n2;
+    }
+    if (n > (ip[0] << 2)) {
+        makewt(n >> 2, ip, w);
+    }
+    if (n2 > 4) {
+        bitrv2col(n1, n2, ip + 2, a);
+    }
+    if (n1 > 2) {
+        bitrv2row(n1, n2, ip + 2, a);
+    }
+    if (isgn < 0) {
+        cftfcol(n1, n2, a, w);
+        cftfrow(n1, n2, a, w);
+    } else {
+        cftbcol(n1, n2, a, w);
+        cftbrow(n1, n2, a, w);
+    }
+}
+
+
+void rdft2d(int n1, int n2, int isgn, double **a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2col(int n1, int n, int *ip, double **a);
+    void bitrv2row(int n, int n2, int *ip, double **a);
+    void cftbcol(int n1, int n, double **a, double *w);
+    void cftbrow(int n, int n2, double **a, double *w);
+    void cftfcol(int n1, int n, double **a, double *w);
+    void cftfrow(int n, int n2, double **a, double *w);
+    void rftbcol(int n1, int n, double **a, int nc, double *c);
+    void rftfcol(int n1, int n, double **a, int nc, double *c);
+    int n, nw, nc, n1h, i, j;
+    double xi;
+    
+    n = n1 << 1;
+    if (n < n2) {
+        n = n2;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n2 > (nc << 2)) {
+        nc = n2 >> 2;
+        makect(nc, ip, w + nw);
+    }
+    n1h = n1 >> 1;
+    if (isgn < 0) {
+        for (i = 1; i <= n1h - 1; i++) {
+            j = n1 - i;
+            xi = a[i][0] - a[j][0];
+            a[i][0] += a[j][0];
+            a[j][0] = xi;
+            xi = a[j][1] - a[i][1];
+            a[i][1] += a[j][1];
+            a[j][1] = xi;
+        }
+        if (n1 > 2) {
+            bitrv2row(n1, n2, ip + 2, a);
+        }
+        cftfrow(n1, n2, a, w);
+        for (i = 0; i <= n1 - 1; i++) {
+            a[i][1] = 0.5 * (a[i][0] - a[i][1]);
+            a[i][0] -= a[i][1];
+        }
+        if (n2 > 4) {
+            rftfcol(n1, n2, a, nc, w + nw);
+            bitrv2col(n1, n2, ip + 2, a);
+        }
+        cftfcol(n1, n2, a, w);
+    } else {
+        if (n2 > 4) {
+            bitrv2col(n1, n2, ip + 2, a);
+        }
+        cftbcol(n1, n2, a, w);
+        if (n2 > 4) {
+            rftbcol(n1, n2, a, nc, w + nw);
+        }
+        for (i = 0; i <= n1 - 1; i++) {
+            xi = a[i][0] - a[i][1];
+            a[i][0] += a[i][1];
+            a[i][1] = xi;
+        }
+        if (n1 > 2) {
+            bitrv2row(n1, n2, ip + 2, a);
+        }
+        cftbrow(n1, n2, a, w);
+        for (i = 1; i <= n1h - 1; i++) {
+            j = n1 - i;
+            a[j][0] = 0.5 * (a[i][0] - a[j][0]);
+            a[i][0] -= a[j][0];
+            a[j][1] = 0.5 * (a[i][1] + a[j][1]);
+            a[i][1] -= a[j][1];
+        }
+    }
+}
+
+
+void ddct2d(int n1, int n2, int isgn, double **a, double **t, 
+    int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2col(int n1, int n, int *ip, double **a);
+    void bitrv2row(int n, int n2, int *ip, double **a);
+    void cftbcol(int n1, int n, double **a, double *w);
+    void cftbrow(int n, int n2, double **a, double *w);
+    void cftfcol(int n1, int n, double **a, double *w);
+    void cftfrow(int n, int n2, double **a, double *w);
+    void rftbcol(int n1, int n, double **a, int nc, double *c);
+    void rftfcol(int n1, int n, double **a, int nc, double *c);
+    void dctbsub(int n1, int n2, double **a, int nc, double *c);
+    void dctfsub(int n1, int n2, double **a, int nc, double *c);
+    int n, nw, nc, n1h, n2h, i, ix, ic, j, jx, jc;
+    double xi;
+    
+    n = n1 << 1;
+    if (n < n2) {
+        n = n2;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n1 > nc || n2 > nc) {
+        if (n1 > n2) {
+            nc = n1;
+        } else {
+            nc = n2;
+        }
+        makect(nc, ip, w + nw);
+    }
+    n1h = n1 >> 1;
+    n2h = n2 >> 1;
+    if (isgn >= 0) {
+        for (i = 0; i <= n1 - 1; i++) {
+            for (j = 1; j <= n2h - 1; j++) {
+                jx = j << 1;
+                t[i][jx] = a[i][j];
+                t[i][jx + 1] = a[i][n2 - j];
+            }
+        }
+        t[0][0] = a[0][0];
+        t[0][1] = a[0][n2h];
+        t[n1h][0] = a[n1h][0];
+        t[n1h][1] = a[n1h][n2h];
+        for (i = 1; i <= n1h - 1; i++) {
+            ic = n1 - i;
+            t[i][0] = a[i][0];
+            t[ic][1] = a[i][n2h];
+            t[i][1] = a[ic][0];
+            t[ic][0] = a[ic][n2h];
+        }
+        dctfsub(n1, n2, t, nc, w + nw);
+        if (n1 > 2) {
+            bitrv2row(n1, n2, ip + 2, t);
+        }
+        cftfrow(n1, n2, t, w);
+        for (i = 0; i <= n1 - 1; i++) {
+            t[i][1] = 0.5 * (t[i][0] - t[i][1]);
+            t[i][0] -= t[i][1];
+        }
+        if (n2 > 4) {
+            rftfcol(n1, n2, t, nc, w + nw);
+            bitrv2col(n1, n2, ip + 2, t);
+        }
+        cftfcol(n1, n2, t, w);
+        for (i = 0; i <= n1h - 1; i++) {
+            ix = i << 1;
+            ic = n1 - 1 - i;
+            for (j = 0; j <= n2h - 1; j++) {
+                jx = j << 1;
+                jc = n2 - 1 - j;
+                a[ix][jx] = t[i][j];
+                a[ix][jx + 1] = t[i][jc];
+                a[ix + 1][jx] = t[ic][j];
+                a[ix + 1][jx + 1] = t[ic][jc];
+            }
+        }
+    } else {
+        for (i = 0; i <= n1h - 1; i++) {
+            ix = i << 1;
+            ic = n1 - 1 - i;
+            for (j = 0; j <= n2h - 1; j++) {
+                jx = j << 1;
+                jc = n2 - 1 - j;
+                t[i][j] = a[ix][jx];
+                t[i][jc] = a[ix][jx + 1];
+                t[ic][j] = a[ix + 1][jx];
+                t[ic][jc] = a[ix + 1][jx + 1];
+            }
+        }
+        if (n2 > 4) {
+            bitrv2col(n1, n2, ip + 2, t);
+        }
+        cftbcol(n1, n2, t, w);
+        if (n2 > 4) {
+            rftbcol(n1, n2, t, nc, w + nw);
+        }
+        for (i = 0; i <= n1 - 1; i++) {
+            xi = t[i][0] - t[i][1];
+            t[i][0] += t[i][1];
+            t[i][1] = xi;
+        }
+        if (n1 > 2) {
+            bitrv2row(n1, n2, ip + 2, t);
+        }
+        cftbrow(n1, n2, t, w);
+        dctbsub(n1, n2, t, nc, w + nw);
+        for (i = 0; i <= n1 - 1; i++) {
+            for (j = 1; j <= n2h - 1; j++) {
+                jx = j << 1;
+                a[i][j] = t[i][jx];
+                a[i][n2 - j] = t[i][jx + 1];
+            }
+        }
+        a[0][0] = t[0][0];
+        a[0][n2h] = t[0][1];
+        a[n1h][0] = t[n1h][0];
+        a[n1h][n2h] = t[n1h][1];
+        for (i = 1; i <= n1h - 1; i++) {
+            ic = n1 - i;
+            a[i][0] = t[i][0];
+            a[i][n2h] = t[ic][1];
+            a[ic][0] = t[i][1];
+            a[ic][n2h] = t[ic][0];
+        }
+    }
+}
+
+
+void ddst2d(int n1, int n2, int isgn, double **a, double **t, 
+    int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2col(int n1, int n, int *ip, double **a);
+    void bitrv2row(int n, int n2, int *ip, double **a);
+    void cftbcol(int n1, int n, double **a, double *w);
+    void cftbrow(int n, int n2, double **a, double *w);
+    void cftfcol(int n1, int n, double **a, double *w);
+    void cftfrow(int n, int n2, double **a, double *w);
+    void rftbcol(int n1, int n, double **a, int nc, double *c);
+    void rftfcol(int n1, int n, double **a, int nc, double *c);
+    void dstbsub(int n1, int n2, double **a, int nc, double *c);
+    void dstfsub(int n1, int n2, double **a, int nc, double *c);
+    int n, nw, nc, n1h, n2h, i, ix, ic, j, jx, jc;
+    double xi;
+    
+    n = n1 << 1;
+    if (n < n2) {
+        n = n2;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n1 > nc || n2 > nc) {
+        if (n1 > n2) {
+            nc = n1;
+        } else {
+            nc = n2;
+        }
+        makect(nc, ip, w + nw);
+    }
+    n1h = n1 >> 1;
+    n2h = n2 >> 1;
+    if (isgn >= 0) {
+        for (i = 0; i <= n1 - 1; i++) {
+            for (j = 1; j <= n2h - 1; j++) {
+                jx = j << 1;
+                t[i][jx] = a[i][j];
+                t[i][jx + 1] = a[i][n2 - j];
+            }
+        }
+        t[0][0] = a[0][0];
+        t[0][1] = a[0][n2h];
+        t[n1h][0] = a[n1h][0];
+        t[n1h][1] = a[n1h][n2h];
+        for (i = 1; i <= n1h - 1; i++) {
+            ic = n1 - i;
+            t[i][0] = a[i][0];
+            t[ic][1] = a[i][n2h];
+            t[i][1] = a[ic][0];
+            t[ic][0] = a[ic][n2h];
+        }
+        dstfsub(n1, n2, t, nc, w + nw);
+        if (n1 > 2) {
+            bitrv2row(n1, n2, ip + 2, t);
+        }
+        cftfrow(n1, n2, t, w);
+        for (i = 0; i <= n1 - 1; i++) {
+            t[i][1] = 0.5 * (t[i][0] - t[i][1]);
+            t[i][0] -= t[i][1];
+        }
+        if (n2 > 4) {
+            rftfcol(n1, n2, t, nc, w + nw);
+            bitrv2col(n1, n2, ip + 2, t);
+        }
+        cftfcol(n1, n2, t, w);
+        for (i = 0; i <= n1h - 1; i++) {
+            ix = i << 1;
+            ic = n1 - 1 - i;
+            for (j = 0; j <= n2h - 1; j++) {
+                jx = j << 1;
+                jc = n2 - 1 - j;
+                a[ix][jx] = t[i][j];
+                a[ix][jx + 1] = -t[i][jc];
+                a[ix + 1][jx] = -t[ic][j];
+                a[ix + 1][jx + 1] = t[ic][jc];
+            }
+        }
+    } else {
+        for (i = 0; i <= n1h - 1; i++) {
+            ix = i << 1;
+            ic = n1 - 1 - i;
+            for (j = 0; j <= n2h - 1; j++) {
+                jx = j << 1;
+                jc = n2 - 1 - j;
+                t[i][j] = a[ix][jx];
+                t[i][jc] = -a[ix][jx + 1];
+                t[ic][j] = -a[ix + 1][jx];
+                t[ic][jc] = a[ix + 1][jx + 1];
+            }
+        }
+        if (n2 > 4) {
+            bitrv2col(n1, n2, ip + 2, t);
+        }
+        cftbcol(n1, n2, t, w);
+        if (n2 > 4) {
+            rftbcol(n1, n2, t, nc, w + nw);
+        }
+        for (i = 0; i <= n1 - 1; i++) {
+            xi = t[i][0] - t[i][1];
+            t[i][0] += t[i][1];
+            t[i][1] = xi;
+        }
+        if (n1 > 2) {
+            bitrv2row(n1, n2, ip + 2, t);
+        }
+        cftbrow(n1, n2, t, w);
+        dstbsub(n1, n2, t, nc, w + nw);
+        for (i = 0; i <= n1 - 1; i++) {
+            for (j = 1; j <= n2h - 1; j++) {
+                jx = j << 1;
+                a[i][j] = t[i][jx];
+                a[i][n2 - j] = t[i][jx + 1];
+            }
+        }
+        a[0][0] = t[0][0];
+        a[0][n2h] = t[0][1];
+        a[n1h][0] = t[n1h][0];
+        a[n1h][n2h] = t[n1h][1];
+        for (i = 1; i <= n1h - 1; i++) {
+            ic = n1 - i;
+            a[i][0] = t[i][0];
+            a[i][n2h] = t[ic][1];
+            a[ic][0] = t[i][1];
+            a[ic][n2h] = t[ic][0];
+        }
+    }
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w)
+{
+    void bitrv2(int n, int *ip, double *a);
+    int nwh, j;
+    double delta, x, y;
+    
+    ip[0] = nw;
+    ip[1] = 1;
+    if (nw > 2) {
+        nwh = nw >> 1;
+        delta = atan(1.0) / nwh;
+        w[0] = 1;
+        w[1] = 0;
+        w[nwh] = cos(delta * nwh);
+        w[nwh + 1] = w[nwh];
+        for (j = 2; j <= nwh - 2; j += 2) {
+            x = cos(delta * j);
+            y = sin(delta * j);
+            w[j] = x;
+            w[j + 1] = y;
+            w[nw - j] = y;
+            w[nw - j + 1] = x;
+        }
+        bitrv2(nw, ip + 2, w);
+    }
+}
+
+
+void makect(int nc, int *ip, double *c)
+{
+    int nch, j;
+    double delta;
+    
+    ip[1] = nc;
+    if (nc > 1) {
+        nch = nc >> 1;
+        delta = atan(1.0) / nch;
+        c[0] = 0.5;
+        c[nch] = 0.5 * cos(delta * nch);
+        for (j = 1; j <= nch - 1; j++) {
+            c[j] = 0.5 * cos(delta * j);
+            c[nc - j] = 0.5 * sin(delta * j);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+void bitrv2(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, m2;
+    double xr, xi;
+    
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 2) < l) {
+        l >>= 1;
+        for (j = 0; j <= m - 1; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    if ((m << 2) > l) {
+        for (k = 1; k <= m - 1; k++) {
+            for (j = 0; j <= k - 1; j++) {
+                j1 = (j << 1) + ip[k];
+                k1 = (k << 1) + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                a[j1] = a[k1];
+                a[j1 + 1] = a[k1 + 1];
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+        }
+    } else {
+        m2 = m << 1;
+        for (k = 1; k <= m - 1; k++) {
+            for (j = 0; j <= k - 1; j++) {
+                j1 = (j << 1) + ip[k];
+                k1 = (k << 1) + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                a[j1] = a[k1];
+                a[j1 + 1] = a[k1 + 1];
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                a[j1] = a[k1];
+                a[j1 + 1] = a[k1 + 1];
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+        }
+    }
+}
+
+
+void bitrv2col(int n1, int n, int *ip, double **a)
+{
+    int i, j, j1, k, k1, l, m, m2;
+    double xr, xi;
+    
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 2) < l) {
+        l >>= 1;
+        for (j = 0; j <= m - 1; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    if ((m << 2) > l) {
+        for (i = 0; i <= n1 - 1; i++) {
+            for (k = 1; k <= m - 1; k++) {
+                for (j = 0; j <= k - 1; j++) {
+                    j1 = (j << 1) + ip[k];
+                    k1 = (k << 1) + ip[j];
+                    xr = a[i][j1];
+                    xi = a[i][j1 + 1];
+                    a[i][j1] = a[i][k1];
+                    a[i][j1 + 1] = a[i][k1 + 1];
+                    a[i][k1] = xr;
+                    a[i][k1 + 1] = xi;
+                }
+            }
+        }
+    } else {
+        m2 = m << 1;
+        for (i = 0; i <= n1 - 1; i++) {
+            for (k = 1; k <= m - 1; k++) {
+                for (j = 0; j <= k - 1; j++) {
+                    j1 = (j << 1) + ip[k];
+                    k1 = (k << 1) + ip[j];
+                    xr = a[i][j1];
+                    xi = a[i][j1 + 1];
+                    a[i][j1] = a[i][k1];
+                    a[i][j1 + 1] = a[i][k1 + 1];
+                    a[i][k1] = xr;
+                    a[i][k1 + 1] = xi;
+                    j1 += m2;
+                    k1 += m2;
+                    xr = a[i][j1];
+                    xi = a[i][j1 + 1];
+                    a[i][j1] = a[i][k1];
+                    a[i][j1 + 1] = a[i][k1 + 1];
+                    a[i][k1] = xr;
+                    a[i][k1 + 1] = xi;
+                }
+            }
+        }
+    }
+}
+
+
+void bitrv2row(int n, int n2, int *ip, double **a)
+{
+    int i, j, j1, k, k1, l, m;
+    double xr, xi;
+    
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 1) < l) {
+        l >>= 1;
+        for (j = 0; j <= m - 1; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    if ((m << 1) > l) {
+        for (k = 1; k <= m - 1; k++) {
+            for (j = 0; j <= k - 1; j++) {
+                j1 = j + ip[k];
+                k1 = k + ip[j];
+                for (i = 0; i <= n2 - 2; i += 2) {
+                    xr = a[j1][i];
+                    xi = a[j1][i + 1];
+                    a[j1][i] = a[k1][i];
+                    a[j1][i + 1] = a[k1][i + 1];
+                    a[k1][i] = xr;
+                    a[k1][i + 1] = xi;
+                }
+            }
+        }
+    } else {
+        for (k = 1; k <= m - 1; k++) {
+            for (j = 0; j <= k - 1; j++) {
+                j1 = j + ip[k];
+                k1 = k + ip[j];
+                for (i = 0; i <= n2 - 2; i += 2) {
+                    xr = a[j1][i];
+                    xi = a[j1][i + 1];
+                    a[j1][i] = a[k1][i];
+                    a[j1][i + 1] = a[k1][i + 1];
+                    a[k1][i] = xr;
+                    a[k1][i + 1] = xi;
+                }
+                j1 += m;
+                k1 += m;
+                for (i = 0; i <= n2 - 2; i += 2) {
+                    xr = a[j1][i];
+                    xi = a[j1][i + 1];
+                    a[j1][i] = a[k1][i];
+                    a[j1][i + 1] = a[k1][i + 1];
+                    a[k1][i] = xr;
+                    a[k1][i + 1] = xi;
+                }
+            }
+        }
+    }
+}
+
+
+void cftbcol(int n1, int n, double **a, double *w)
+{
+    int i, j, j1, j2, j3, k, k1, ks, l, m;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    for (i = 0; i <= n1 - 1; i++) {
+        l = 2;
+        while ((l << 1) < n) {
+            m = l << 2;
+            for (j = 0; j <= l - 2; j += 2) {
+                j1 = j + l;
+                j2 = j1 + l;
+                j3 = j2 + l;
+                x0r = a[i][j] + a[i][j1];
+                x0i = a[i][j + 1] + a[i][j1 + 1];
+                x1r = a[i][j] - a[i][j1];
+                x1i = a[i][j + 1] - a[i][j1 + 1];
+                x2r = a[i][j2] + a[i][j3];
+                x2i = a[i][j2 + 1] + a[i][j3 + 1];
+                x3r = a[i][j2] - a[i][j3];
+                x3i = a[i][j2 + 1] - a[i][j3 + 1];
+                a[i][j] = x0r + x2r;
+                a[i][j + 1] = x0i + x2i;
+                a[i][j2] = x0r - x2r;
+                a[i][j2 + 1] = x0i - x2i;
+                a[i][j1] = x1r - x3i;
+                a[i][j1 + 1] = x1i + x3r;
+                a[i][j3] = x1r + x3i;
+                a[i][j3 + 1] = x1i - x3r;
+            }
+            if (m < n) {
+                wk1r = w[2];
+                for (j = m; j <= l + m - 2; j += 2) {
+                    j1 = j + l;
+                    j2 = j1 + l;
+                    j3 = j2 + l;
+                    x0r = a[i][j] + a[i][j1];
+                    x0i = a[i][j + 1] + a[i][j1 + 1];
+                    x1r = a[i][j] - a[i][j1];
+                    x1i = a[i][j + 1] - a[i][j1 + 1];
+                    x2r = a[i][j2] + a[i][j3];
+                    x2i = a[i][j2 + 1] + a[i][j3 + 1];
+                    x3r = a[i][j2] - a[i][j3];
+                    x3i = a[i][j2 + 1] - a[i][j3 + 1];
+                    a[i][j] = x0r + x2r;
+                    a[i][j + 1] = x0i + x2i;
+                    a[i][j2] = x2i - x0i;
+                    a[i][j2 + 1] = x0r - x2r;
+                    x0r = x1r - x3i;
+                    x0i = x1i + x3r;
+                    a[i][j1] = wk1r * (x0r - x0i);
+                    a[i][j1 + 1] = wk1r * (x0r + x0i);
+                    x0r = x3i + x1r;
+                    x0i = x3r - x1i;
+                    a[i][j3] = wk1r * (x0i - x0r);
+                    a[i][j3 + 1] = wk1r * (x0i + x0r);
+                }
+                k1 = 1;
+                ks = -1;
+                for (k = (m << 1); k <= n - m; k += m) {
+                    k1++;
+                    ks = -ks;
+                    wk1r = w[k1 << 1];
+                    wk1i = w[(k1 << 1) + 1];
+                    wk2r = ks * w[k1];
+                    wk2i = w[k1 + ks];
+                    wk3r = wk1r - 2 * wk2i * wk1i;
+                    wk3i = 2 * wk2i * wk1r - wk1i;
+                    for (j = k; j <= l + k - 2; j += 2) {
+                        j1 = j + l;
+                        j2 = j1 + l;
+                        j3 = j2 + l;
+                        x0r = a[i][j] + a[i][j1];
+                        x0i = a[i][j + 1] + a[i][j1 + 1];
+                        x1r = a[i][j] - a[i][j1];
+                        x1i = a[i][j + 1] - a[i][j1 + 1];
+                        x2r = a[i][j2] + a[i][j3];
+                        x2i = a[i][j2 + 1] + a[i][j3 + 1];
+                        x3r = a[i][j2] - a[i][j3];
+                        x3i = a[i][j2 + 1] - a[i][j3 + 1];
+                        a[i][j] = x0r + x2r;
+                        a[i][j + 1] = x0i + x2i;
+                        x0r -= x2r;
+                        x0i -= x2i;
+                        a[i][j2] = wk2r * x0r - wk2i * x0i;
+                        a[i][j2 + 1] = wk2r * x0i + wk2i * x0r;
+                        x0r = x1r - x3i;
+                        x0i = x1i + x3r;
+                        a[i][j1] = wk1r * x0r - wk1i * x0i;
+                        a[i][j1 + 1] = wk1r * x0i + wk1i * x0r;
+                        x0r = x1r + x3i;
+                        x0i = x1i - x3r;
+                        a[i][j3] = wk3r * x0r - wk3i * x0i;
+                        a[i][j3 + 1] = wk3r * x0i + wk3i * x0r;
+                    }
+                }
+            }
+            l = m;
+        }
+        if (l < n) {
+            for (j = 0; j <= l - 2; j += 2) {
+                j1 = j + l;
+                x0r = a[i][j] - a[i][j1];
+                x0i = a[i][j + 1] - a[i][j1 + 1];
+                a[i][j] += a[i][j1];
+                a[i][j + 1] += a[i][j1 + 1];
+                a[i][j1] = x0r;
+                a[i][j1 + 1] = x0i;
+            }
+        }
+    }
+}
+
+
+void cftbrow(int n, int n2, double **a, double *w)
+{
+    int i, j, j1, j2, j3, k, k1, ks, l, m;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 1;
+    while ((l << 1) < n) {
+        m = l << 2;
+        for (j = 0; j <= l - 1; j++) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            for (i = 0; i <= n2 - 2; i += 2) {
+                x0r = a[j][i] + a[j1][i];
+                x0i = a[j][i + 1] + a[j1][i + 1];
+                x1r = a[j][i] - a[j1][i];
+                x1i = a[j][i + 1] - a[j1][i + 1];
+                x2r = a[j2][i] + a[j3][i];
+                x2i = a[j2][i + 1] + a[j3][i + 1];
+                x3r = a[j2][i] - a[j3][i];
+                x3i = a[j2][i + 1] - a[j3][i + 1];
+                a[j][i] = x0r + x2r;
+                a[j][i + 1] = x0i + x2i;
+                a[j2][i] = x0r - x2r;
+                a[j2][i + 1] = x0i - x2i;
+                a[j1][i] = x1r - x3i;
+                a[j1][i + 1] = x1i + x3r;
+                a[j3][i] = x1r + x3i;
+                a[j3][i + 1] = x1i - x3r;
+            }
+        }
+        if (m < n) {
+            wk1r = w[2];
+            for (j = m; j <= l + m - 1; j++) {
+                j1 = j + l;
+                j2 = j1 + l;
+                j3 = j2 + l;
+                for (i = 0; i <= n2 - 2; i += 2) {
+                    x0r = a[j][i] + a[j1][i];
+                    x0i = a[j][i + 1] + a[j1][i + 1];
+                    x1r = a[j][i] - a[j1][i];
+                    x1i = a[j][i + 1] - a[j1][i + 1];
+                    x2r = a[j2][i] + a[j3][i];
+                    x2i = a[j2][i + 1] + a[j3][i + 1];
+                    x3r = a[j2][i] - a[j3][i];
+                    x3i = a[j2][i + 1] - a[j3][i + 1];
+                    a[j][i] = x0r + x2r;
+                    a[j][i + 1] = x0i + x2i;
+                    a[j2][i] = x2i - x0i;
+                    a[j2][i + 1] = x0r - x2r;
+                    x0r = x1r - x3i;
+                    x0i = x1i + x3r;
+                    a[j1][i] = wk1r * (x0r - x0i);
+                    a[j1][i + 1] = wk1r * (x0r + x0i);
+                    x0r = x3i + x1r;
+                    x0i = x3r - x1i;
+                    a[j3][i] = wk1r * (x0i - x0r);
+                    a[j3][i + 1] = wk1r * (x0i + x0r);
+                }
+            }
+            k1 = 1;
+            ks = -1;
+            for (k = (m << 1); k <= n - m; k += m) {
+                k1++;
+                ks = -ks;
+                wk1r = w[k1 << 1];
+                wk1i = w[(k1 << 1) + 1];
+                wk2r = ks * w[k1];
+                wk2i = w[k1 + ks];
+                wk3r = wk1r - 2 * wk2i * wk1i;
+                wk3i = 2 * wk2i * wk1r - wk1i;
+                for (j = k; j <= l + k - 1; j++) {
+                    j1 = j + l;
+                    j2 = j1 + l;
+                    j3 = j2 + l;
+                    for (i = 0; i <= n2 - 2; i += 2) {
+                        x0r = a[j][i] + a[j1][i];
+                        x0i = a[j][i + 1] + a[j1][i + 1];
+                        x1r = a[j][i] - a[j1][i];
+                        x1i = a[j][i + 1] - a[j1][i + 1];
+                        x2r = a[j2][i] + a[j3][i];
+                        x2i = a[j2][i + 1] + a[j3][i + 1];
+                        x3r = a[j2][i] - a[j3][i];
+                        x3i = a[j2][i + 1] - a[j3][i + 1];
+                        a[j][i] = x0r + x2r;
+                        a[j][i + 1] = x0i + x2i;
+                        x0r -= x2r;
+                        x0i -= x2i;
+                        a[j2][i] = wk2r * x0r - wk2i * x0i;
+                        a[j2][i + 1] = wk2r * x0i + wk2i * x0r;
+                        x0r = x1r - x3i;
+                        x0i = x1i + x3r;
+                        a[j1][i] = wk1r * x0r - wk1i * x0i;
+                        a[j1][i + 1] = wk1r * x0i + wk1i * x0r;
+                        x0r = x1r + x3i;
+                        x0i = x1i - x3r;
+                        a[j3][i] = wk3r * x0r - wk3i * x0i;
+                        a[j3][i + 1] = wk3r * x0i + wk3i * x0r;
+                    }
+                }
+            }
+        }
+        l = m;
+    }
+    if (l < n) {
+        for (j = 0; j <= l - 1; j++) {
+            j1 = j + l;
+            for (i = 0; i <= n2 - 2; i += 2) {
+                x0r = a[j][i] - a[j1][i];
+                x0i = a[j][i + 1] - a[j1][i + 1];
+                a[j][i] += a[j1][i];
+                a[j][i + 1] += a[j1][i + 1];
+                a[j1][i] = x0r;
+                a[j1][i + 1] = x0i;
+            }
+        }
+    }
+}
+
+
+void cftfcol(int n1, int n, double **a, double *w)
+{
+    int i, j, j1, j2, j3, k, k1, ks, l, m;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    for (i = 0; i <= n1 - 1; i++) {
+        l = 2;
+        while ((l << 1) < n) {
+            m = l << 2;
+            for (j = 0; j <= l - 2; j += 2) {
+                j1 = j + l;
+                j2 = j1 + l;
+                j3 = j2 + l;
+                x0r = a[i][j] + a[i][j1];
+                x0i = a[i][j + 1] + a[i][j1 + 1];
+                x1r = a[i][j] - a[i][j1];
+                x1i = a[i][j + 1] - a[i][j1 + 1];
+                x2r = a[i][j2] + a[i][j3];
+                x2i = a[i][j2 + 1] + a[i][j3 + 1];
+                x3r = a[i][j2] - a[i][j3];
+                x3i = a[i][j2 + 1] - a[i][j3 + 1];
+                a[i][j] = x0r + x2r;
+                a[i][j + 1] = x0i + x2i;
+                a[i][j2] = x0r - x2r;
+                a[i][j2 + 1] = x0i - x2i;
+                a[i][j1] = x1r + x3i;
+                a[i][j1 + 1] = x1i - x3r;
+                a[i][j3] = x1r - x3i;
+                a[i][j3 + 1] = x1i + x3r;
+            }
+            if (m < n) {
+                wk1r = w[2];
+                for (j = m; j <= l + m - 2; j += 2) {
+                    j1 = j + l;
+                    j2 = j1 + l;
+                    j3 = j2 + l;
+                    x0r = a[i][j] + a[i][j1];
+                    x0i = a[i][j + 1] + a[i][j1 + 1];
+                    x1r = a[i][j] - a[i][j1];
+                    x1i = a[i][j + 1] - a[i][j1 + 1];
+                    x2r = a[i][j2] + a[i][j3];
+                    x2i = a[i][j2 + 1] + a[i][j3 + 1];
+                    x3r = a[i][j2] - a[i][j3];
+                    x3i = a[i][j2 + 1] - a[i][j3 + 1];
+                    a[i][j] = x0r + x2r;
+                    a[i][j + 1] = x0i + x2i;
+                    a[i][j2] = x0i - x2i;
+                    a[i][j2 + 1] = x2r - x0r;
+                    x0r = x1r + x3i;
+                    x0i = x1i - x3r;
+                    a[i][j1] = wk1r * (x0i + x0r);
+                    a[i][j1 + 1] = wk1r * (x0i - x0r);
+                    x0r = x3i - x1r;
+                    x0i = x3r + x1i;
+                    a[i][j3] = wk1r * (x0r + x0i);
+                    a[i][j3 + 1] = wk1r * (x0r - x0i);
+                }
+                k1 = 1;
+                ks = -1;
+                for (k = (m << 1); k <= n - m; k += m) {
+                    k1++;
+                    ks = -ks;
+                    wk1r = w[k1 << 1];
+                    wk1i = w[(k1 << 1) + 1];
+                    wk2r = ks * w[k1];
+                    wk2i = w[k1 + ks];
+                    wk3r = wk1r - 2 * wk2i * wk1i;
+                    wk3i = 2 * wk2i * wk1r - wk1i;
+                    for (j = k; j <= l + k - 2; j += 2) {
+                        j1 = j + l;
+                        j2 = j1 + l;
+                        j3 = j2 + l;
+                        x0r = a[i][j] + a[i][j1];
+                        x0i = a[i][j + 1] + a[i][j1 + 1];
+                        x1r = a[i][j] - a[i][j1];
+                        x1i = a[i][j + 1] - a[i][j1 + 1];
+                        x2r = a[i][j2] + a[i][j3];
+                        x2i = a[i][j2 + 1] + a[i][j3 + 1];
+                        x3r = a[i][j2] - a[i][j3];
+                        x3i = a[i][j2 + 1] - a[i][j3 + 1];
+                        a[i][j] = x0r + x2r;
+                        a[i][j + 1] = x0i + x2i;
+                        x0r -= x2r;
+                        x0i -= x2i;
+                        a[i][j2] = wk2r * x0r + wk2i * x0i;
+                        a[i][j2 + 1] = wk2r * x0i - wk2i * x0r;
+                        x0r = x1r + x3i;
+                        x0i = x1i - x3r;
+                        a[i][j1] = wk1r * x0r + wk1i * x0i;
+                        a[i][j1 + 1] = wk1r * x0i - wk1i * x0r;
+                        x0r = x1r - x3i;
+                        x0i = x1i + x3r;
+                        a[i][j3] = wk3r * x0r + wk3i * x0i;
+                        a[i][j3 + 1] = wk3r * x0i - wk3i * x0r;
+                    }
+                }
+            }
+            l = m;
+        }
+        if (l < n) {
+            for (j = 0; j <= l - 2; j += 2) {
+                j1 = j + l;
+                x0r = a[i][j] - a[i][j1];
+                x0i = a[i][j + 1] - a[i][j1 + 1];
+                a[i][j] += a[i][j1];
+                a[i][j + 1] += a[i][j1 + 1];
+                a[i][j1] = x0r;
+                a[i][j1 + 1] = x0i;
+            }
+        }
+    }
+}
+
+
+void cftfrow(int n, int n2, double **a, double *w)
+{
+    int i, j, j1, j2, j3, k, k1, ks, l, m;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 1;
+    while ((l << 1) < n) {
+        m = l << 2;
+        for (j = 0; j <= l - 1; j++) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            for (i = 0; i <= n2 - 2; i += 2) {
+                x0r = a[j][i] + a[j1][i];
+                x0i = a[j][i + 1] + a[j1][i + 1];
+                x1r = a[j][i] - a[j1][i];
+                x1i = a[j][i + 1] - a[j1][i + 1];
+                x2r = a[j2][i] + a[j3][i];
+                x2i = a[j2][i + 1] + a[j3][i + 1];
+                x3r = a[j2][i] - a[j3][i];
+                x3i = a[j2][i + 1] - a[j3][i + 1];
+                a[j][i] = x0r + x2r;
+                a[j][i + 1] = x0i + x2i;
+                a[j2][i] = x0r - x2r;
+                a[j2][i + 1] = x0i - x2i;
+                a[j1][i] = x1r + x3i;
+                a[j1][i + 1] = x1i - x3r;
+                a[j3][i] = x1r - x3i;
+                a[j3][i + 1] = x1i + x3r;
+            }
+        }
+        if (m < n) {
+            wk1r = w[2];
+            for (j = m; j <= l + m - 1; j++) {
+                j1 = j + l;
+                j2 = j1 + l;
+                j3 = j2 + l;
+                for (i = 0; i <= n2 - 2; i += 2) {
+                    x0r = a[j][i] + a[j1][i];
+                    x0i = a[j][i + 1] + a[j1][i + 1];
+                    x1r = a[j][i] - a[j1][i];
+                    x1i = a[j][i + 1] - a[j1][i + 1];
+                    x2r = a[j2][i] + a[j3][i];
+                    x2i = a[j2][i + 1] + a[j3][i + 1];
+                    x3r = a[j2][i] - a[j3][i];
+                    x3i = a[j2][i + 1] - a[j3][i + 1];
+                    a[j][i] = x0r + x2r;
+                    a[j][i + 1] = x0i + x2i;
+                    a[j2][i] = x0i - x2i;
+                    a[j2][i + 1] = x2r - x0r;
+                    x0r = x1r + x3i;
+                    x0i = x1i - x3r;
+                    a[j1][i] = wk1r * (x0i + x0r);
+                    a[j1][i + 1] = wk1r * (x0i - x0r);
+                    x0r = x3i - x1r;
+                    x0i = x3r + x1i;
+                    a[j3][i] = wk1r * (x0r + x0i);
+                    a[j3][i + 1] = wk1r * (x0r - x0i);
+                }
+            }
+            k1 = 1;
+            ks = -1;
+            for (k = (m << 1); k <= n - m; k += m) {
+                k1++;
+                ks = -ks;
+                wk1r = w[k1 << 1];
+                wk1i = w[(k1 << 1) + 1];
+                wk2r = ks * w[k1];
+                wk2i = w[k1 + ks];
+                wk3r = wk1r - 2 * wk2i * wk1i;
+                wk3i = 2 * wk2i * wk1r - wk1i;
+                for (j = k; j <= l + k - 1; j++) {
+                    j1 = j + l;
+                    j2 = j1 + l;
+                    j3 = j2 + l;
+                    for (i = 0; i <= n2 - 2; i += 2) {
+                        x0r = a[j][i] + a[j1][i];
+                        x0i = a[j][i + 1] + a[j1][i + 1];
+                        x1r = a[j][i] - a[j1][i];
+                        x1i = a[j][i + 1] - a[j1][i + 1];
+                        x2r = a[j2][i] + a[j3][i];
+                        x2i = a[j2][i + 1] + a[j3][i + 1];
+                        x3r = a[j2][i] - a[j3][i];
+                        x3i = a[j2][i + 1] - a[j3][i + 1];
+                        a[j][i] = x0r + x2r;
+                        a[j][i + 1] = x0i + x2i;
+                        x0r -= x2r;
+                        x0i -= x2i;
+                        a[j2][i] = wk2r * x0r + wk2i * x0i;
+                        a[j2][i + 1] = wk2r * x0i - wk2i * x0r;
+                        x0r = x1r + x3i;
+                        x0i = x1i - x3r;
+                        a[j1][i] = wk1r * x0r + wk1i * x0i;
+                        a[j1][i + 1] = wk1r * x0i - wk1i * x0r;
+                        x0r = x1r - x3i;
+                        x0i = x1i + x3r;
+                        a[j3][i] = wk3r * x0r + wk3i * x0i;
+                        a[j3][i + 1] = wk3r * x0i - wk3i * x0r;
+                    }
+                }
+            }
+        }
+        l = m;
+    }
+    if (l < n) {
+        for (j = 0; j <= l - 1; j++) {
+            j1 = j + l;
+            for (i = 0; i <= n2 - 2; i += 2) {
+                x0r = a[j][i] - a[j1][i];
+                x0i = a[j][i + 1] - a[j1][i + 1];
+                a[j][i] += a[j1][i];
+                a[j][i + 1] += a[j1][i + 1];
+                a[j1][i] = x0r;
+                a[j1][i + 1] = x0i;
+            }
+        }
+    }
+}
+
+
+void rftbcol(int n1, int n, double **a, int nc, double *c)
+{
+    int i, j, k, kk, ks;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    ks = (nc << 2) / n;
+    for (i = 0; i <= n1 - 1; i++) {
+        kk = 0;
+        for (k = (n >> 1) - 2; k >= 2; k -= 2) {
+            j = n - k;
+            kk += ks;
+            wkr = 0.5 - c[kk];
+            wki = c[nc - kk];
+            xr = a[i][k] - a[i][j];
+            xi = a[i][k + 1] + a[i][j + 1];
+            yr = wkr * xr - wki * xi;
+            yi = wkr * xi + wki * xr;
+            a[i][k] -= yr;
+            a[i][k + 1] -= yi;
+            a[i][j] += yr;
+            a[i][j + 1] -= yi;
+        }
+    }
+}
+
+
+void rftfcol(int n1, int n, double **a, int nc, double *c)
+{
+    int i, j, k, kk, ks;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    ks = (nc << 2) / n;
+    for (i = 0; i <= n1 - 1; i++) {
+        kk = 0;
+        for (k = (n >> 1) - 2; k >= 2; k -= 2) {
+            j = n - k;
+            kk += ks;
+            wkr = 0.5 - c[kk];
+            wki = c[nc - kk];
+            xr = a[i][k] - a[i][j];
+            xi = a[i][k + 1] + a[i][j + 1];
+            yr = wkr * xr + wki * xi;
+            yi = wkr * xi - wki * xr;
+            a[i][k] -= yr;
+            a[i][k + 1] -= yi;
+            a[i][j] += yr;
+            a[i][j + 1] -= yi;
+        }
+    }
+}
+
+
+void dctbsub(int n1, int n2, double **a, int nc, double *c)
+{
+    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+    
+    ks1 = nc / n1;
+    ks2 = nc / n2;
+    n1h = n1 >> 1;
+    kk1 = ks1;
+    for (k1 = 1; k1 <= n1h - 1; k1++) {
+        j1 = n1 - k1;
+        w1r = 2 * c[kk1];
+        w1i = 2 * c[nc - kk1];
+        kk1 += ks1;
+        kk2 = ks2;
+        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+            x0r = w1r * c[kk2];
+            x0i = w1i * c[kk2];
+            x1r = w1r * c[nc - kk2];
+            x1i = w1i * c[nc - kk2];
+            wkr = x0r - x1i;
+            wki = x0i + x1r;
+            wji = x0r + x1i;
+            wjr = x0i - x1r;
+            kk2 += ks2;
+            x0r = wkr * a[k1][k2] - wki * a[k1][k2 + 1];
+            x0i = wkr * a[k1][k2 + 1] + wki * a[k1][k2];
+            x1r = wjr * a[j1][k2] - wji * a[j1][k2 + 1];
+            x1i = wjr * a[j1][k2 + 1] + wji * a[j1][k2];
+            a[k1][k2] = x0r + x1i;
+            a[k1][k2 + 1] = x0i - x1r;
+            a[j1][k2] = x1r + x0i;
+            a[j1][k2 + 1] = x1i - x0r;
+        }
+        wkr = w1r * 0.5;
+        wki = w1i * 0.5;
+        wjr = w1r * c[kk2];
+        wji = w1i * c[kk2];
+        x0r = a[k1][0] + a[j1][0];
+        x0i = a[k1][1] - a[j1][1];
+        x1r = a[k1][0] - a[j1][0];
+        x1i = a[k1][1] + a[j1][1];
+        a[k1][0] = wkr * x0r - wki * x0i;
+        a[k1][1] = wkr * x0i + wki * x0r;
+        a[j1][0] = -wjr * x1r + wji * x1i;
+        a[j1][1] = wjr * x1i + wji * x1r;
+    }
+    w1r = 2 * c[kk1];
+    kk2 = ks2;
+    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+        wkr = 2 * c[kk2];
+        wki = 2 * c[nc - kk2];
+        wjr = w1r * wkr;
+        wji = w1r * wki;
+        kk2 += ks2;
+        x0i = wkr * a[0][k2 + 1] + wki * a[0][k2];
+        a[0][k2] = wkr * a[0][k2] - wki * a[0][k2 + 1];
+        a[0][k2 + 1] = x0i;
+        x0i = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
+        a[n1h][k2] = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
+        a[n1h][k2 + 1] = x0i;
+    }
+    a[0][1] *= w1r;
+    a[n1h][0] *= w1r;
+    a[n1h][1] *= 0.5;
+}
+
+
+void dctfsub(int n1, int n2, double **a, int nc, double *c)
+{
+    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+    
+    ks1 = nc / n1;
+    ks2 = nc / n2;
+    n1h = n1 >> 1;
+    kk1 = ks1;
+    for (k1 = 1; k1 <= n1h - 1; k1++) {
+        j1 = n1 - k1;
+        w1r = 2 * c[kk1];
+        w1i = 2 * c[nc - kk1];
+        kk1 += ks1;
+        kk2 = ks2;
+        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+            x0r = w1r * c[kk2];
+            x0i = w1i * c[kk2];
+            x1r = w1r * c[nc - kk2];
+            x1i = w1i * c[nc - kk2];
+            wkr = x0r - x1i;
+            wki = x0i + x1r;
+            wji = x0r + x1i;
+            wjr = x0i - x1r;
+            kk2 += ks2;
+            x0r = a[k1][k2] - a[j1][k2 + 1];
+            x0i = a[j1][k2] + a[k1][k2 + 1];
+            x1r = a[j1][k2] - a[k1][k2 + 1];
+            x1i = a[k1][k2] + a[j1][k2 + 1];
+            a[k1][k2] = wkr * x0r + wki * x0i;
+            a[k1][k2 + 1] = wkr * x0i - wki * x0r;
+            a[j1][k2] = wjr * x1r + wji * x1i;
+            a[j1][k2 + 1] = wjr * x1i - wji * x1r;
+        }
+        x0r = 2 * c[kk2];
+        wjr = x0r * w1r;
+        wji = x0r * w1i;
+        x0r = w1r * a[k1][0] + w1i * a[k1][1];
+        x0i = w1r * a[k1][1] - w1i * a[k1][0];
+        x1r = -wjr * a[j1][0] + wji * a[j1][1];
+        x1i = wjr * a[j1][1] + wji * a[j1][0];
+        a[k1][0] = x0r + x1r;
+        a[k1][1] = x1i + x0i;
+        a[j1][0] = x0r - x1r;
+        a[j1][1] = x1i - x0i;
+    }
+    w1r = 2 * c[kk1];
+    kk2 = ks2;
+    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+        wkr = 2 * c[kk2];
+        wki = 2 * c[nc - kk2];
+        wjr = w1r * wkr;
+        wji = w1r * wki;
+        kk2 += ks2;
+        x0i = wkr * a[0][k2 + 1] - wki * a[0][k2];
+        a[0][k2] = wkr * a[0][k2] + wki * a[0][k2 + 1];
+        a[0][k2 + 1] = x0i;
+        x0i = wjr * a[n1h][k2 + 1] - wji * a[n1h][k2];
+        a[n1h][k2] = wjr * a[n1h][k2] + wji * a[n1h][k2 + 1];
+        a[n1h][k2 + 1] = x0i;
+    }
+    w1r *= 2;
+    a[0][0] *= 2;
+    a[0][1] *= w1r;
+    a[n1h][0] *= w1r;
+}
+
+
+void dstbsub(int n1, int n2, double **a, int nc, double *c)
+{
+    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+    
+    ks1 = nc / n1;
+    ks2 = nc / n2;
+    n1h = n1 >> 1;
+    kk1 = ks1;
+    for (k1 = 1; k1 <= n1h - 1; k1++) {
+        j1 = n1 - k1;
+        w1r = 2 * c[kk1];
+        w1i = 2 * c[nc - kk1];
+        kk1 += ks1;
+        kk2 = ks2;
+        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+            x0r = w1r * c[kk2];
+            x0i = w1i * c[kk2];
+            x1r = w1r * c[nc - kk2];
+            x1i = w1i * c[nc - kk2];
+            wkr = x0r - x1i;
+            wki = x0i + x1r;
+            wji = x0r + x1i;
+            wjr = x0i - x1r;
+            kk2 += ks2;
+            x0r = wkr * a[k1][k2] - wki * a[k1][k2 + 1];
+            x0i = wkr * a[k1][k2 + 1] + wki * a[k1][k2];
+            x1r = wjr * a[j1][k2] - wji * a[j1][k2 + 1];
+            x1i = wjr * a[j1][k2 + 1] + wji * a[j1][k2];
+            a[k1][k2] = x1i - x0r;
+            a[k1][k2 + 1] = x1r + x0i;
+            a[j1][k2] = x0i - x1r;
+            a[j1][k2 + 1] = x0r + x1i;
+        }
+        wkr = w1r * 0.5;
+        wki = w1i * 0.5;
+        wjr = w1r * c[kk2];
+        wji = w1i * c[kk2];
+        x0r = a[k1][0] + a[j1][0];
+        x0i = a[k1][1] - a[j1][1];
+        x1r = a[k1][0] - a[j1][0];
+        x1i = a[k1][1] + a[j1][1];
+        a[k1][1] = wkr * x0r - wki * x0i;
+        a[k1][0] = wkr * x0i + wki * x0r;
+        a[j1][1] = -wjr * x1r + wji * x1i;
+        a[j1][0] = wjr * x1i + wji * x1r;
+    }
+    w1r = 2 * c[kk1];
+    kk2 = ks2;
+    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+        wkr = 2 * c[kk2];
+        wki = 2 * c[nc - kk2];
+        wjr = w1r * wkr;
+        wji = w1r * wki;
+        kk2 += ks2;
+        x0i = wkr * a[0][k2 + 1] + wki * a[0][k2];
+        a[0][k2 + 1] = wkr * a[0][k2] - wki * a[0][k2 + 1];
+        a[0][k2] = x0i;
+        x0i = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
+        a[n1h][k2 + 1] = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
+        a[n1h][k2] = x0i;
+    }
+    a[0][1] *= w1r;
+    a[n1h][0] *= w1r;
+    a[n1h][1] *= 0.5;
+}
+
+
+void dstfsub(int n1, int n2, double **a, int nc, double *c)
+{
+    int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+    double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+    
+    ks1 = nc / n1;
+    ks2 = nc / n2;
+    n1h = n1 >> 1;
+    kk1 = ks1;
+    for (k1 = 1; k1 <= n1h - 1; k1++) {
+        j1 = n1 - k1;
+        w1r = 2 * c[kk1];
+        w1i = 2 * c[nc - kk1];
+        kk1 += ks1;
+        kk2 = ks2;
+        for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+            x0r = w1r * c[kk2];
+            x0i = w1i * c[kk2];
+            x1r = w1r * c[nc - kk2];
+            x1i = w1i * c[nc - kk2];
+            wkr = x0r - x1i;
+            wki = x0i + x1r;
+            wji = x0r + x1i;
+            wjr = x0i - x1r;
+            kk2 += ks2;
+            x0r = a[j1][k2 + 1] - a[k1][k2];
+            x0i = a[k1][k2 + 1] + a[j1][k2];
+            x1r = a[k1][k2 + 1] - a[j1][k2];
+            x1i = a[j1][k2 + 1] + a[k1][k2];
+            a[k1][k2] = wkr * x0r + wki * x0i;
+            a[k1][k2 + 1] = wkr * x0i - wki * x0r;
+            a[j1][k2] = wjr * x1r + wji * x1i;
+            a[j1][k2 + 1] = wjr * x1i - wji * x1r;
+        }
+        x0r = 2 * c[kk2];
+        wjr = x0r * w1r;
+        wji = x0r * w1i;
+        x0r = w1r * a[k1][1] + w1i * a[k1][0];
+        x0i = w1r * a[k1][0] - w1i * a[k1][1];
+        x1r = -wjr * a[j1][1] + wji * a[j1][0];
+        x1i = wjr * a[j1][0] + wji * a[j1][1];
+        a[k1][0] = x0r + x1r;
+        a[k1][1] = x1i + x0i;
+        a[j1][0] = x0r - x1r;
+        a[j1][1] = x1i - x0i;
+    }
+    w1r = 2 * c[kk1];
+    kk2 = ks2;
+    for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+        wkr = 2 * c[kk2];
+        wki = 2 * c[nc - kk2];
+        wjr = w1r * wkr;
+        wji = w1r * wki;
+        kk2 += ks2;
+        x0i = wkr * a[0][k2] - wki * a[0][k2 + 1];
+        a[0][k2] = wkr * a[0][k2 + 1] + wki * a[0][k2];
+        a[0][k2 + 1] = x0i;
+        x0i = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
+        a[n1h][k2] = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
+        a[n1h][k2 + 1] = x0i;
+    }
+    w1r *= 2;
+    a[0][0] *= 2;
+    a[0][1] *= w1r;
+    a[n1h][0] *= w1r;
+}
+
diff --git a/fft4f2d.f b/fft4f2d.f
new file mode 100644
index 0000000..af529e3
--- /dev/null
+++ b/fft4f2d.f
@@ -0,0 +1,1591 @@
+! Fast Fourier/Cosine/Sine Transform
+!     dimension   :two
+!     data length :power of 2
+!     decimation  :frequency
+!     radix       :4, 2, row-column
+!     data        :inplace
+!     table       :use
+! subroutines
+!     cdft2d: Complex Discrete Fourier Transform
+!     rdft2d: Real Discrete Fourier Transform
+!     ddct2d: Discrete Cosine Transform
+!     ddst2d: Discrete Sine Transform
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+!     [definition]
+!         <case1>
+!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
+!                            exp(2*pi*i*j1*k1/n1) * 
+!                            exp(2*pi*i*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2>
+!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
+!                            exp(-2*pi*i*j1*k1/n1) * 
+!                            exp(-2*pi*i*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call cdft2d(n1max, 2*n1, n2, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call cdft2d(n1max, 2*n1, n2, -1, a, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         2*n1   :data length (integer)
+!                 n1 >= 1, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 1, n2 = power of 2
+!         a(0:2*n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 input data
+!                     a(2*j1,j2) = Re(x(j1,j2)), 
+!                     a(2*j1+1,j2) = Im(x(j1,j2)), 
+!                     0<=j1<n1, 0<=j2<n2
+!                 output data
+!                     a(2*k1,k2) = Re(X(k1,k2)), 
+!                     a(2*k1+1,k2) = Im(X(k1,k2)), 
+!                     0<=k1<n1, 0<=k2<n2
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1, n2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/2, n2/2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call cdft2d(n1max, 2*n1, n2, -1, a, ip, w)
+!         is 
+!             call cdft2d(n1max, 2*n1, n2, 1, a, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, 2 * n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (1.0d0 / (n1 * n2))
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+!     [definition]
+!         <case1> RDFT
+!             R(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!             I(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2> IRDFT (excluding scale)
+!             a(k1,k2) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+!                            (R(j1,j2) * 
+!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
+!                            I(j1,j2) * 
+!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
+!                            0<=k1<n1, 0<=k2<n2
+!         (notes: R(n1-k1,n2-k2) = R(k1,k2), 
+!                 I(n1-k1,n2-k2) = -I(k1,k2), 
+!                 R(n1-k1,0) = R(k1,0), 
+!                 I(n1-k1,0) = -I(k1,0), 
+!                 R(0,n2-k2) = R(0,k2), 
+!                 I(0,n2-k2) = -I(0,k2), 
+!                 0<k1<n1, 0<k2<n2)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call rdft2d(n1max, n1, n2, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call rdft2d(n1max, n1, n2, -1, a, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         a(0:n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 <case1>
+!                     output data
+!                         a(2*k1,k2) = R(k1,k2) = R(n1-k1,n2-k2), 
+!                         a(2*k1+1,k2) = I(k1,k2) = -I(n1-k1,n2-k2), 
+!                             0<k1<n1/2, 0<k2<n2, 
+!                         a(2*k1,0) = R(k1,0) = R(n1-k1,0), 
+!                         a(2*k1+1,0) = I(k1,0) = -I(n1-k1,0), 
+!                             0<k1<n1/2, 
+!                         a(0,k2) = R(0,k2) = R(0,n2-k2), 
+!                         a(1,k2) = I(0,k2) = -I(0,n2-k2), 
+!                         a(1,n2-k2) = R(n1/2,k2) = R(n1/2,n2-k2), 
+!                         a(0,n2-k2) = -I(n1/2,k2) = I(n1/2,n2-k2), 
+!                             0<k2<n2/2, 
+!                         a(0,0) = R(0,0), 
+!                         a(1,0) = R(n1/2,0), 
+!                         a(0,n2/2) = R(0,n2/2), 
+!                         a(1,n2/2) = R(n1/2,n2/2)
+!                 <case2>
+!                     input data
+!                         a(2*j1,j2) = R(j1,j2) = R(n1-j1,n2-j2), 
+!                         a(2*j1+1,j2) = I(j1,j2) = -I(n1-j1,n2-j2), 
+!                             0<j1<n1/2, 0<j2<n2, 
+!                         a(2*j1,0) = R(j1,0) = R(n1-j1,0), 
+!                         a(2*j1+1,0) = I(j1,0) = -I(n1-j1,0), 
+!                             0<j1<n1/2, 
+!                         a(0,j2) = R(0,j2) = R(0,n2-j2), 
+!                         a(1,j2) = I(0,j2) = -I(0,n2-j2), 
+!                         a(1,n2-j2) = R(n1/2,j2) = R(n1/2,n2-j2), 
+!                         a(0,n2-j2) = -I(n1/2,j2) = I(n1/2,n2-j2), 
+!                             0<j2<n2/2, 
+!                         a(0,0) = R(0,0), 
+!                         a(1,0) = R(n1/2,0), 
+!                         a(0,n2/2) = R(0,n2/2), 
+!                         a(1,n2/2) = R(n1/2,n2/2)
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/4, n2/2) + n1/4
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call rdft2d(n1max, n1, n2, 1, a, ip, w)
+!         is 
+!             call rdft2d(n1max, n1, n2, -1, a, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (2.0d0 / (n1 * n2))
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+!     [definition]
+!         <case1> IDCT (excluding scale)
+!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            cos(pi*j1*(k1+1/2)/n1) * 
+!                            cos(pi*j2*(k2+1/2)/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2> DCT
+!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            cos(pi*(j1+1/2)*k1/n1) * 
+!                            cos(pi*(j2+1/2)*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         a(0:n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 output data
+!                     a(k1,k2) = C(k1,k2), 0<=k1<n1, 0<=k2<n2
+!         t(0:n1-1,0:n2-1)
+!                :work area (real*8)
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/4, n2/2) + max(n1, n2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+!         is 
+!             do j1 = 0, n1 - 1
+!                 a(j1, 0) = a(j1, 0) * 0.5d0
+!             end do
+!             do j2 = 0, n2 - 1
+!                 a(0, j2) = a(0, j2) * 0.5d0
+!             end do
+!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (4.0d0 / (n1 * n2))
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+!     [definition]
+!         <case1> IDST (excluding scale)
+!             S(k1,k2) = sum_j1=1^n1 sum_j2=1^n2 A(j1,j2) * 
+!                            sin(pi*j1*(k1+1/2)/n1) * 
+!                            sin(pi*j2*(k2+1/2)/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2> DST
+!             S(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            sin(pi*(j1+1/2)*k1/n1) * 
+!                            sin(pi*(j2+1/2)*k2/n2), 
+!                            0<k1<=n1, 0<k2<=n2
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         a(0:n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 <case1>
+!                     input data
+!                         a(j1,j2) = A(j1,j2), 0<j1<n1, 0<j2<n2, 
+!                         a(j1,0) = A(j1,n2), 0<j1<n1, 
+!                         a(0,j2) = A(n1,j2), 0<j2<n2, 
+!                         a(0,0) = A(n1,n2)
+!                         (i.e. A(j1,j2) = a(mod(j1,n1),mod(j2,n2)))
+!                     output data
+!                         a(k1,k2) = S(k1,k2), 0<=k1<n1, 0<=k2<n2
+!                 <case2>
+!                     output data
+!                         a(k1,k2) = S(k1,k2), 0<k1<n1, 0<k2<n2, 
+!                         a(k1,0) = S(k1,n2), 0<k1<n1, 
+!                         a(0,k2) = S(n1,k2), 0<k2<n2, 
+!                         a(0,0) = S(n1,n2)
+!                         (i.e. S(k1,k2) = a(mod(k1,n1),mod(k2,n2)))
+!         t(0:n1-1,0:n2-1)
+!                :work area (real*8)
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/4, n2/2) + max(n1, n2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+!         is 
+!             do j1 = 0, n1 - 1
+!                 a(j1, 0) = a(j1, 0) * 0.5d0
+!             end do
+!             do j2 = 0, n2 - 1
+!                 a(0, j2) = a(0, j2) * 0.5d0
+!             end do
+!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (4.0d0 / (n1 * n2))
+!                 end do
+!             end do
+!         .
+!
+!
+      subroutine cdft2d(n1max, n1, n2, isgn, a, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
+      n = max(n1, 2 * n2)
+      if (n .gt. 4 * ip(0)) then
+          call makewt(n / 4, ip, w)
+      end if
+      if (n1 .gt. 4) then
+          call bitrv2row(n1max, n1, n2, ip(2), a)
+      end if
+      if (n2 .gt. 2) then
+          call bitrv2col(n1max, n1, n2, ip(2), a)
+      end if
+      if (isgn .lt. 0) then
+          call cftfrow(n1max, n1, n2, a, w)
+          call cftfcol(n1max, n1, n2, a, w)
+      else
+          call cftbrow(n1max, n1, n2, a, w)
+          call cftbcol(n1max, n1, n2, a, w)
+      end if
+      end
+!
+      subroutine rdft2d(n1max, n1, n2, isgn, a, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n2h, i, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
+      n = max(n1, 2 * n2)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n1 .gt. 4 * nc) then
+          nc = n1 / 4
+          call makect(nc, ip, w(nw))
+      end if
+      n2h = n2 / 2
+      if (isgn .lt. 0) then
+          do i = 1, n2h - 1
+              j = n2 - i
+              xi = a(0, i) - a(0, j)
+              a(0, i) = a(0, i) + a(0, j)
+              a(0, j) = xi
+              xi = a(1, j) - a(1, i)
+              a(1, i) = a(1, i) + a(1, j)
+              a(1, j) = xi
+          end do
+          if (n2 .gt. 2) then
+              call bitrv2col(n1max, n1, n2, ip(2), a)
+          end if
+          call cftfcol(n1max, n1, n2, a, w)
+          do i = 0, n2 - 1
+              a(1, i) = 0.5d0 * (a(0, i) - a(1, i))
+              a(0, i) = a(0, i) - a(1, i)
+          end do
+          if (n1 .gt. 4) then
+              call rftfrow(n1max, n1, n2, a, nc, w(nw))
+              call bitrv2row(n1max, n1, n2, ip(2), a)
+          end if
+          call cftfrow(n1max, n1, n2, a, w)
+      else
+          if (n1 .gt. 4) then
+              call bitrv2row(n1max, n1, n2, ip(2), a)
+          end if
+          call cftbrow(n1max, n1, n2, a, w)
+          if (n1 .gt. 4) then
+              call rftbrow(n1max, n1, n2, a, nc, w(nw))
+          end if
+          do i = 0, n2 - 1
+              xi = a(0, i) - a(1, i)
+              a(0, i) = a(0, i) + a(1, i)
+              a(1, i) = xi
+          end do
+          if (n2 .gt. 2) then
+              call bitrv2col(n1max, n1, n2, ip(2), a)
+          end if
+          call cftbcol(n1max, n1, n2, a, w)
+          do i = 1, n2h - 1
+              j = n2 - i
+              a(0, j) = 0.5d0 * (a(0, i) - a(0, j))
+              a(0, i) = a(0, i) - a(0, j)
+              a(1, j) = 0.5d0 * (a(1, i) + a(1, j))
+              a(1, i) = a(1, i) - a(1, j)
+          end do
+      end if
+      end
+!
+      subroutine ddct2d(n1max, n1, n2, isgn, a, t, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n1h, n2h, 
+     &    i, ix, ic, j, jx, jc
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), 
+     &    t(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
+      n = max(n1, 2 * n2)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n1 .gt. nc .or. n2 .gt. nc) then
+          nc = max(n1, n2)
+          call makect(nc, ip, w(nw))
+      end if
+      n1h = n1 / 2
+      n2h = n2 / 2
+      if (isgn .ge. 0) then
+          do i = 0, n2 - 1
+              do j = 1, n1h - 1
+                  jx = 2 * j
+                  t(jx, i) = a(j, i)
+                  t(jx + 1, i) = a(n1 - j, i)
+              end do
+          end do
+          t(0, 0) = a(0, 0)
+          t(1, 0) = a(n1h, 0)
+          t(0, n2h) = a(0, n2h)
+          t(1, n2h) = a(n1h, n2h)
+          do i = 1, n2h - 1
+              ic = n2 - i
+              t(0, i) = a(0, i)
+              t(1, ic) = a(n1h, i)
+              t(1, i) = a(0, ic)
+              t(0, ic) = a(n1h, ic)
+          end do
+          call dctfsub(n1max, n1, n2, t, nc, w(nw))
+          if (n2 .gt. 2) then
+              call bitrv2col(n1max, n1, n2, ip(2), t)
+          end if
+          call cftfcol(n1max, n1, n2, t, w)
+          do i = 0, n2 - 1
+              t(1, i) = 0.5d0 * (t(0, i) - t(1, i))
+              t(0, i) = t(0, i) - t(1, i)
+          end do
+          if (n1 .gt. 4) then
+              call rftfrow(n1max, n1, n2, t, nc, w(nw))
+              call bitrv2row(n1max, n1, n2, ip(2), t)
+          end if
+          call cftfrow(n1max, n1, n2, t, w)
+          do i = 0, n2h - 1
+              ix = 2 * i
+              ic = n2 - 1 - i
+              do j = 0, n1h - 1
+                  jx = 2 * j
+                  jc = n1 - 1 - j
+                  a(jx, ix) = t(j, i)
+                  a(jx + 1, ix) = t(jc, i)
+                  a(jx, ix + 1) = t(j, ic)
+                  a(jx + 1, ix + 1) = t(jc, ic)
+              end do
+          end do
+      else
+          do i = 0, n2h - 1
+              ix = 2 * i
+              ic = n2 - 1 - i
+              do j = 0, n1h - 1
+                  jx = 2 * j
+                  jc = n1 - 1 - j
+                  t(j, i) = a(jx, ix)
+                  t(jc, i) = a(jx + 1, ix)
+                  t(j, ic) = a(jx, ix + 1)
+                  t(jc, ic) = a(jx + 1, ix + 1)
+              end do
+          end do
+          if (n1 .gt. 4) then
+              call bitrv2row(n1max, n1, n2, ip(2), t)
+          end if
+          call cftbrow(n1max, n1, n2, t, w)
+          if (n1 .gt. 4) then
+              call rftbrow(n1max, n1, n2, t, nc, w(nw))
+          end if
+          do i = 0, n2 - 1
+              xi = t(0, i) - t(1, i)
+              t(0, i) = t(0, i) + t(1, i)
+              t(1, i) = xi
+          end do
+          if (n2 .gt. 2) then
+              call bitrv2col(n1max, n1, n2, ip(2), t)
+          end if
+          call cftbcol(n1max, n1, n2, t, w)
+          call dctbsub(n1max, n1, n2, t, nc, w(nw))
+          do i = 0, n2 - 1
+              do j = 1, n1h - 1
+                  jx = 2 * j
+                  a(j, i) = t(jx, i)
+                  a(n1 - j, i) = t(jx + 1, i)
+              end do
+          end do
+          a(0, 0) = t(0, 0)
+          a(n1h, 0) = t(1, 0)
+          a(0, n2h) = t(0, n2h)
+          a(n1h, n2h) = t(1, n2h)
+          do i = 1, n2h - 1
+              ic = n2 - i
+              a(0, i) = t(0, i)
+              a(n1h, i) = t(1, ic)
+              a(0, ic) = t(1, i)
+              a(n1h, ic) = t(0, ic)
+          end do
+      end if
+      end
+!
+      subroutine ddst2d(n1max, n1, n2, isgn, a, t, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n1h, n2h, 
+     &    i, ix, ic, j, jx, jc
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), 
+     &    t(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
+      n = max(n1, 2 * n2)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n1 .gt. nc .or. n2 .gt. nc) then
+          nc = max(n1, n2)
+          call makect(nc, ip, w(nw))
+      end if
+      n1h = n1 / 2
+      n2h = n2 / 2
+      if (isgn .ge. 0) then
+          do i = 0, n2 - 1
+              do j = 1, n1h - 1
+                  jx = 2 * j
+                  t(jx, i) = a(j, i)
+                  t(jx + 1, i) = a(n1 - j, i)
+              end do
+          end do
+          t(0, 0) = a(0, 0)
+          t(1, 0) = a(n1h, 0)
+          t(0, n2h) = a(0, n2h)
+          t(1, n2h) = a(n1h, n2h)
+          do i = 1, n2h - 1
+              ic = n2 - i
+              t(0, i) = a(0, i)
+              t(1, ic) = a(n1h, i)
+              t(1, i) = a(0, ic)
+              t(0, ic) = a(n1h, ic)
+          end do
+          call dstfsub(n1max, n1, n2, t, nc, w(nw))
+          if (n2 .gt. 2) then
+              call bitrv2col(n1max, n1, n2, ip(2), t)
+          end if
+          call cftfcol(n1max, n1, n2, t, w)
+          do i = 0, n2 - 1
+              t(1, i) = 0.5d0 * (t(0, i) - t(1, i))
+              t(0, i) = t(0, i) - t(1, i)
+          end do
+          if (n1 .gt. 4) then
+              call rftfrow(n1max, n1, n2, t, nc, w(nw))
+              call bitrv2row(n1max, n1, n2, ip(2), t)
+          end if
+          call cftfrow(n1max, n1, n2, t, w)
+          do i = 0, n2h - 1
+              ix = 2 * i
+              ic = n2 - 1 - i
+              do j = 0, n1h - 1
+                  jx = 2 * j
+                  jc = n1 - 1 - j
+                  a(jx, ix) = t(j, i)
+                  a(jx + 1, ix) = -t(jc, i)
+                  a(jx, ix + 1) = -t(j, ic)
+                  a(jx + 1, ix + 1) = t(jc, ic)
+              end do
+          end do
+      else
+          do i = 0, n2h - 1
+              ix = 2 * i
+              ic = n2 - 1 - i
+              do j = 0, n1h - 1
+                  jx = 2 * j
+                  jc = n1 - 1 - j
+                  t(j, i) = a(jx, ix)
+                  t(jc, i) = -a(jx + 1, ix)
+                  t(j, ic) = -a(jx, ix + 1)
+                  t(jc, ic) = a(jx + 1, ix + 1)
+              end do
+          end do
+          if (n1 .gt. 4) then
+              call bitrv2row(n1max, n1, n2, ip(2), t)
+          end if
+          call cftbrow(n1max, n1, n2, t, w)
+          if (n1 .gt. 4) then
+              call rftbrow(n1max, n1, n2, t, nc, w(nw))
+          end if
+          do i = 0, n2 - 1
+              xi = t(0, i) - t(1, i)
+              t(0, i) = t(0, i) + t(1, i)
+              t(1, i) = xi
+          end do
+          if (n2 .gt. 2) then
+              call bitrv2col(n1max, n1, n2, ip(2), t)
+          end if
+          call cftbcol(n1max, n1, n2, t, w)
+          call dstbsub(n1max, n1, n2, t, nc, w(nw))
+          do i = 0, n2 - 1
+              do j = 1, n1h - 1
+                  jx = 2 * j
+                  a(j, i) = t(jx, i)
+                  a(n1 - j, i) = t(jx + 1, i)
+              end do
+          end do
+          a(0, 0) = t(0, 0)
+          a(n1h, 0) = t(1, 0)
+          a(0, n2h) = t(0, n2h)
+          a(n1h, n2h) = t(1, n2h)
+          do i = 1, n2h - 1
+              ic = n2 - i
+              a(0, i) = t(0, i)
+              a(n1h, i) = t(1, ic)
+              a(0, ic) = t(1, i)
+              a(n1h, ic) = t(0, ic)
+          end do
+      end if
+      end
+!
+! -------- initializing routines --------
+!
+      subroutine makewt(nw, ip, w)
+      integer nw, ip(0 : *), nwh, j
+      real*8 w(0 : nw - 1), delta, x, y
+      ip(0) = nw
+      ip(1) = 1
+      if (nw .gt. 2) then
+          nwh = nw / 2
+          delta = atan(1.0d0) / nwh
+          w(0) = 1
+          w(1) = 0
+          w(nwh) = cos(delta * nwh)
+          w(nwh + 1) = w(nwh)
+          do j = 2, nwh - 2, 2
+              x = cos(delta * j)
+              y = sin(delta * j)
+              w(j) = x
+              w(j + 1) = y
+              w(nw - j) = y
+              w(nw - j + 1) = x
+          end do
+          call bitrv2(nw, ip(2), w)
+      end if
+      end
+!
+      subroutine makect(nc, ip, c)
+      integer nc, ip(0 : *), nch, j
+      real*8 c(0 : nc - 1), delta
+      ip(1) = nc
+      if (nc .gt. 1) then
+          nch = nc / 2
+          delta = atan(1.0d0) / nch
+          c(0) = 0.5d0
+          c(nch) = 0.5d0 * cos(delta * nch)
+          do j = 1, nch - 1
+              c(j) = 0.5d0 * cos(delta * j)
+              c(nc - j) = 0.5d0 * sin(delta * j)
+          end do
+      end if
+      end
+!
+! -------- child routines --------
+!
+      subroutine bitrv2(n, ip, a)
+      integer n, ip(0 : *), j, j1, k, k1, l, m, m2
+      real*8 a(0 : n - 1), xr, xi
+      ip(0) = 0
+      l = n
+      m = 1
+      do while (4 * m .lt. l)
+          l = l / 2
+          do j = 0, m - 1
+              ip(m + j) = ip(j) + l
+          end do
+          m = m * 2
+      end do
+      if (4 * m .gt. l) then
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  a(j1) = a(k1)
+                  a(j1 + 1) = a(k1 + 1)
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+          end do
+      else
+          m2 = 2 * m
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  a(j1) = a(k1)
+                  a(j1 + 1) = a(k1 + 1)
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  a(j1) = a(k1)
+                  a(j1 + 1) = a(k1 + 1)
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+          end do
+      end if
+      end
+!
+      subroutine bitrv2row(n1max, n, n2, ip, a)
+      integer n1max, n, n2, ip(0 : *), i, j, j1, k, k1, l, m, m2
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), xr, xi
+      ip(0) = 0
+      l = n
+      m = 1
+      do while (4 * m .lt. l)
+          l = l / 2
+          do j = 0, m - 1
+              ip(m + j) = ip(j) + l
+          end do
+          m = m * 2
+      end do
+      if (4 * m .gt. l) then
+          do i = 0, n2 - 1
+              do k = 1, m - 1
+                  do j = 0, k - 1
+                      j1 = 2 * j + ip(k)
+                      k1 = 2 * k + ip(j)
+                      xr = a(j1, i)
+                      xi = a(j1 + 1, i)
+                      a(j1, i) = a(k1, i)
+                      a(j1 + 1, i) = a(k1 + 1, i)
+                      a(k1, i) = xr
+                      a(k1 + 1, i) = xi
+                  end do
+              end do
+          end do
+      else
+          m2 = 2 * m
+          do i = 0, n2 - 1
+              do k = 1, m - 1
+                  do j = 0, k - 1
+                      j1 = 2 * j + ip(k)
+                      k1 = 2 * k + ip(j)
+                      xr = a(j1, i)
+                      xi = a(j1 + 1, i)
+                      a(j1, i) = a(k1, i)
+                      a(j1 + 1, i) = a(k1 + 1, i)
+                      a(k1, i) = xr
+                      a(k1 + 1, i) = xi
+                      j1 = j1 + m2
+                      k1 = k1 + m2
+                      xr = a(j1, i)
+                      xi = a(j1 + 1, i)
+                      a(j1, i) = a(k1, i)
+                      a(j1 + 1, i) = a(k1 + 1, i)
+                      a(k1, i) = xr
+                      a(k1 + 1, i) = xi
+                  end do
+              end do
+          end do
+      end if
+      end
+!
+      subroutine bitrv2col(n1max, n1, n, ip, a)
+      integer n1max, n1, n, ip(0 : *), i, j, j1, k, k1, l, m
+      real*8 a(0 : n1max - 1, 0 : n - 1), xr, xi
+      ip(0) = 0
+      l = n
+      m = 1
+      do while (2 * m .lt. l)
+          l = l / 2
+          do j = 0, m - 1
+              ip(m + j) = ip(j) + l
+          end do
+          m = m * 2
+      end do
+      if (2 * m .gt. l) then
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = j + ip(k)
+                  k1 = k + ip(j)
+                  do i = 0, n1 - 2, 2
+                      xr = a(i, j1)
+                      xi = a(i + 1, j1)
+                      a(i, j1) = a(i, k1)
+                      a(i + 1, j1) = a(i + 1, k1)
+                      a(i, k1) = xr
+                      a(i + 1, k1) = xi
+                  end do
+              end do
+          end do
+      else
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = j + ip(k)
+                  k1 = k + ip(j)
+                  do i = 0, n1 - 2, 2
+                      xr = a(i, j1)
+                      xi = a(i + 1, j1)
+                      a(i, j1) = a(i, k1)
+                      a(i + 1, j1) = a(i + 1, k1)
+                      a(i, k1) = xr
+                      a(i + 1, k1) = xi
+                  end do
+                  j1 = j1 + m
+                  k1 = k1 + m
+                  do i = 0, n1 - 2, 2
+                      xr = a(i, j1)
+                      xi = a(i + 1, j1)
+                      a(i, j1) = a(i, k1)
+                      a(i + 1, j1) = a(i + 1, k1)
+                      a(i, k1) = xr
+                      a(i + 1, k1) = xi
+                  end do
+              end do
+          end do
+      end if
+      end
+!
+      subroutine cftbrow(n1max, n, n2, a, w)
+      integer n1max, n, n2, i, j, j1, j2, j3, k, k1, ks, l, m
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
+      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      do i = 0, n2 - 1
+          l = 2
+          do while (2 * l .lt. n)
+              m = 4 * l
+              do j = 0, l - 2, 2
+                  j1 = j + l
+                  j2 = j1 + l
+                  j3 = j2 + l
+                  x0r = a(j, i) + a(j1, i)
+                  x0i = a(j + 1, i) + a(j1 + 1, i)
+                  x1r = a(j, i) - a(j1, i)
+                  x1i = a(j + 1, i) - a(j1 + 1, i)
+                  x2r = a(j2, i) + a(j3, i)
+                  x2i = a(j2 + 1, i) + a(j3 + 1, i)
+                  x3r = a(j2, i) - a(j3, i)
+                  x3i = a(j2 + 1, i) - a(j3 + 1, i)
+                  a(j, i) = x0r + x2r
+                  a(j + 1, i) = x0i + x2i
+                  a(j2, i) = x0r - x2r
+                  a(j2 + 1, i) = x0i - x2i
+                  a(j1, i) = x1r - x3i
+                  a(j1 + 1, i) = x1i + x3r
+                  a(j3, i) = x1r + x3i
+                  a(j3 + 1, i) = x1i - x3r
+              end do
+              if (m .lt. n) then
+                  wk1r = w(2)
+                  do j = m, l + m - 2, 2
+                      j1 = j + l
+                      j2 = j1 + l
+                      j3 = j2 + l
+                      x0r = a(j, i) + a(j1, i)
+                      x0i = a(j + 1, i) + a(j1 + 1, i)
+                      x1r = a(j, i) - a(j1, i)
+                      x1i = a(j + 1, i) - a(j1 + 1, i)
+                      x2r = a(j2, i) + a(j3, i)
+                      x2i = a(j2 + 1, i) + a(j3 + 1, i)
+                      x3r = a(j2, i) - a(j3, i)
+                      x3i = a(j2 + 1, i) - a(j3 + 1, i)
+                      a(j, i) = x0r + x2r
+                      a(j + 1, i) = x0i + x2i
+                      a(j2, i) = x2i - x0i
+                      a(j2 + 1, i) = x0r - x2r
+                      x0r = x1r - x3i
+                      x0i = x1i + x3r
+                      a(j1, i) = wk1r * (x0r - x0i)
+                      a(j1 + 1, i) = wk1r * (x0r + x0i)
+                      x0r = x3i + x1r
+                      x0i = x3r - x1i
+                      a(j3, i) = wk1r * (x0i - x0r)
+                      a(j3 + 1, i) = wk1r * (x0i + x0r)
+                  end do
+                  k1 = 1
+                  ks = -1
+                  do k = 2 * m, n - m, m
+                      k1 = k1 + 1
+                      ks = -ks
+                      wk1r = w(2 * k1)
+                      wk1i = w(2 * k1 + 1)
+                      wk2r = ks * w(k1)
+                      wk2i = w(k1 + ks)
+                      wk3r = wk1r - 2 * wk2i * wk1i
+                      wk3i = 2 * wk2i * wk1r - wk1i
+                      do j = k, l + k - 2, 2
+                          j1 = j + l
+                          j2 = j1 + l
+                          j3 = j2 + l
+                          x0r = a(j, i) + a(j1, i)
+                          x0i = a(j + 1, i) + a(j1 + 1, i)
+                          x1r = a(j, i) - a(j1, i)
+                          x1i = a(j + 1, i) - a(j1 + 1, i)
+                          x2r = a(j2, i) + a(j3, i)
+                          x2i = a(j2 + 1, i) + a(j3 + 1, i)
+                          x3r = a(j2, i) - a(j3, i)
+                          x3i = a(j2 + 1, i) - a(j3 + 1, i)
+                          a(j, i) = x0r + x2r
+                          a(j + 1, i) = x0i + x2i
+                          x0r = x0r - x2r
+                          x0i = x0i - x2i
+                          a(j2, i) = wk2r * x0r - wk2i * x0i
+                          a(j2 + 1, i) = wk2r * x0i + wk2i * x0r
+                          x0r = x1r - x3i
+                          x0i = x1i + x3r
+                          a(j1, i) = wk1r * x0r - wk1i * x0i
+                          a(j1 + 1, i) = wk1r * x0i + wk1i * x0r
+                          x0r = x1r + x3i
+                          x0i = x1i - x3r
+                          a(j3, i) = wk3r * x0r - wk3i * x0i
+                          a(j3 + 1, i) = wk3r * x0i + wk3i * x0r
+                      end do
+                  end do
+              end if
+              l = m
+          end do
+          if (l .lt. n) then
+              do j = 0, l - 2, 2
+                  j1 = j + l
+                  x0r = a(j, i) - a(j1, i)
+                  x0i = a(j + 1, i) - a(j1 + 1, i)
+                  a(j, i) = a(j, i) + a(j1, i)
+                  a(j + 1, i) = a(j + 1, i) + a(j1 + 1, i)
+                  a(j1, i) = x0r
+                  a(j1 + 1, i) = x0i
+              end do
+          end if
+      end do
+      end
+!
+      subroutine cftbcol(n1max, n1, n, a, w)
+      integer n1max, n1, n, i, j, j1, j2, j3, k, k1, ks, l, m
+      real*8 a(0 : n1max - 1, 0 : n - 1), w(0 : *)
+      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      l = 1
+      do while (2 * l .lt. n)
+          m = 4 * l
+          do j = 0, l - 1
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              do i = 0, n1 - 2, 2
+                  x0r = a(i, j) + a(i, j1)
+                  x0i = a(i + 1, j) + a(i + 1, j1)
+                  x1r = a(i, j) - a(i, j1)
+                  x1i = a(i + 1, j) - a(i + 1, j1)
+                  x2r = a(i, j2) + a(i, j3)
+                  x2i = a(i + 1, j2) + a(i + 1, j3)
+                  x3r = a(i, j2) - a(i, j3)
+                  x3i = a(i + 1, j2) - a(i + 1, j3)
+                  a(i, j) = x0r + x2r
+                  a(i + 1, j) = x0i + x2i
+                  a(i, j2) = x0r - x2r
+                  a(i + 1, j2) = x0i - x2i
+                  a(i, j1) = x1r - x3i
+                  a(i + 1, j1) = x1i + x3r
+                  a(i, j3) = x1r + x3i
+                  a(i + 1, j3) = x1i - x3r
+              end do
+          end do
+          if (m .lt. n) then
+              wk1r = w(2)
+              do j = m, l + m - 1
+                  j1 = j + l
+                  j2 = j1 + l
+                  j3 = j2 + l
+                  do i = 0, n1 - 2, 2
+                      x0r = a(i, j) + a(i, j1)
+                      x0i = a(i + 1, j) + a(i + 1, j1)
+                      x1r = a(i, j) - a(i, j1)
+                      x1i = a(i + 1, j) - a(i + 1, j1)
+                      x2r = a(i, j2) + a(i, j3)
+                      x2i = a(i + 1, j2) + a(i + 1, j3)
+                      x3r = a(i, j2) - a(i, j3)
+                      x3i = a(i + 1, j2) - a(i + 1, j3)
+                      a(i, j) = x0r + x2r
+                      a(i + 1, j) = x0i + x2i
+                      a(i, j2) = x2i - x0i
+                      a(i + 1, j2) = x0r - x2r
+                      x0r = x1r - x3i
+                      x0i = x1i + x3r
+                      a(i, j1) = wk1r * (x0r - x0i)
+                      a(i + 1, j1) = wk1r * (x0r + x0i)
+                      x0r = x3i + x1r
+                      x0i = x3r - x1i
+                      a(i, j3) = wk1r * (x0i - x0r)
+                      a(i + 1, j3) = wk1r * (x0i + x0r)
+                  end do
+              end do
+              k1 = 1
+              ks = -1
+              do k = 2 * m, n - m, m
+                  k1 = k1 + 1
+                  ks = -ks
+                  wk1r = w(2 * k1)
+                  wk1i = w(2 * k1 + 1)
+                  wk2r = ks * w(k1)
+                  wk2i = w(k1 + ks)
+                  wk3r = wk1r - 2 * wk2i * wk1i
+                  wk3i = 2 * wk2i * wk1r - wk1i
+                  do j = k, l + k - 1
+                      j1 = j + l
+                      j2 = j1 + l
+                      j3 = j2 + l
+                      do i = 0, n1 - 2, 2
+                          x0r = a(i, j) + a(i, j1)
+                          x0i = a(i + 1, j) + a(i + 1, j1)
+                          x1r = a(i, j) - a(i, j1)
+                          x1i = a(i + 1, j) - a(i + 1, j1)
+                          x2r = a(i, j2) + a(i, j3)
+                          x2i = a(i + 1, j2) + a(i + 1, j3)
+                          x3r = a(i, j2) - a(i, j3)
+                          x3i = a(i + 1, j2) - a(i + 1, j3)
+                          a(i, j) = x0r + x2r
+                          a(i + 1, j) = x0i + x2i
+                          x0r = x0r - x2r
+                          x0i = x0i - x2i
+                          a(i, j2) = wk2r * x0r - wk2i * x0i
+                          a(i + 1, j2) = wk2r * x0i + wk2i * x0r
+                          x0r = x1r - x3i
+                          x0i = x1i + x3r
+                          a(i, j1) = wk1r * x0r - wk1i * x0i
+                          a(i + 1, j1) = wk1r * x0i + wk1i * x0r
+                          x0r = x1r + x3i
+                          x0i = x1i - x3r
+                          a(i, j3) = wk3r * x0r - wk3i * x0i
+                          a(i + 1, j3) = wk3r * x0i + wk3i * x0r
+                      end do
+                  end do
+              end do
+          end if
+          l = m
+      end do
+      if (l .lt. n) then
+          do j = 0, l - 1
+              j1 = j + l
+              do i = 0, n1 - 2, 2
+                  x0r = a(i, j) - a(i, j1)
+                  x0i = a(i + 1, j) - a(i + 1, j1)
+                  a(i, j) = a(i, j) + a(i, j1)
+                  a(i + 1, j) = a(i + 1, j) + a(i + 1, j1)
+                  a(i, j1) = x0r
+                  a(i + 1, j1) = x0i
+              end do
+          end do
+      end if
+      end
+!
+      subroutine cftfrow(n1max, n, n2, a, w)
+      integer n1max, n, n2, i, j, j1, j2, j3, k, k1, ks, l, m
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
+      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      do i = 0, n2 - 1
+          l = 2
+          do while (2 * l .lt. n)
+              m = 4 * l
+              do j = 0, l - 2, 2
+                  j1 = j + l
+                  j2 = j1 + l
+                  j3 = j2 + l
+                  x0r = a(j, i) + a(j1, i)
+                  x0i = a(j + 1, i) + a(j1 + 1, i)
+                  x1r = a(j, i) - a(j1, i)
+                  x1i = a(j + 1, i) - a(j1 + 1, i)
+                  x2r = a(j2, i) + a(j3, i)
+                  x2i = a(j2 + 1, i) + a(j3 + 1, i)
+                  x3r = a(j2, i) - a(j3, i)
+                  x3i = a(j2 + 1, i) - a(j3 + 1, i)
+                  a(j, i) = x0r + x2r
+                  a(j + 1, i) = x0i + x2i
+                  a(j2, i) = x0r - x2r
+                  a(j2 + 1, i) = x0i - x2i
+                  a(j1, i) = x1r + x3i
+                  a(j1 + 1, i) = x1i - x3r
+                  a(j3, i) = x1r - x3i
+                  a(j3 + 1, i) = x1i + x3r
+              end do
+              if (m .lt. n) then
+                  wk1r = w(2)
+                  do j = m, l + m - 2, 2
+                      j1 = j + l
+                      j2 = j1 + l
+                      j3 = j2 + l
+                      x0r = a(j, i) + a(j1, i)
+                      x0i = a(j + 1, i) + a(j1 + 1, i)
+                      x1r = a(j, i) - a(j1, i)
+                      x1i = a(j + 1, i) - a(j1 + 1, i)
+                      x2r = a(j2, i) + a(j3, i)
+                      x2i = a(j2 + 1, i) + a(j3 + 1, i)
+                      x3r = a(j2, i) - a(j3, i)
+                      x3i = a(j2 + 1, i) - a(j3 + 1, i)
+                      a(j, i) = x0r + x2r
+                      a(j + 1, i) = x0i + x2i
+                      a(j2, i) = x0i - x2i
+                      a(j2 + 1, i) = x2r - x0r
+                      x0r = x1r + x3i
+                      x0i = x1i - x3r
+                      a(j1, i) = wk1r * (x0i + x0r)
+                      a(j1 + 1, i) = wk1r * (x0i - x0r)
+                      x0r = x3i - x1r
+                      x0i = x3r + x1i
+                      a(j3, i) = wk1r * (x0r + x0i)
+                      a(j3 + 1, i) = wk1r * (x0r - x0i)
+                  end do
+                  k1 = 1
+                  ks = -1
+                  do k = 2 * m, n - m, m
+                      k1 = k1 + 1
+                      ks = -ks
+                      wk1r = w(2 * k1)
+                      wk1i = w(2 * k1 + 1)
+                      wk2r = ks * w(k1)
+                      wk2i = w(k1 + ks)
+                      wk3r = wk1r - 2 * wk2i * wk1i
+                      wk3i = 2 * wk2i * wk1r - wk1i
+                      do j = k, l + k - 2, 2
+                          j1 = j + l
+                          j2 = j1 + l
+                          j3 = j2 + l
+                          x0r = a(j, i) + a(j1, i)
+                          x0i = a(j + 1, i) + a(j1 + 1, i)
+                          x1r = a(j, i) - a(j1, i)
+                          x1i = a(j + 1, i) - a(j1 + 1, i)
+                          x2r = a(j2, i) + a(j3, i)
+                          x2i = a(j2 + 1, i) + a(j3 + 1, i)
+                          x3r = a(j2, i) - a(j3, i)
+                          x3i = a(j2 + 1, i) - a(j3 + 1, i)
+                          a(j, i) = x0r + x2r
+                          a(j + 1, i) = x0i + x2i
+                          x0r = x0r - x2r
+                          x0i = x0i - x2i
+                          a(j2, i) = wk2r * x0r + wk2i * x0i
+                          a(j2 + 1, i) = wk2r * x0i - wk2i * x0r
+                          x0r = x1r + x3i
+                          x0i = x1i - x3r
+                          a(j1, i) = wk1r * x0r + wk1i * x0i
+                          a(j1 + 1, i) = wk1r * x0i - wk1i * x0r
+                          x0r = x1r - x3i
+                          x0i = x1i + x3r
+                          a(j3, i) = wk3r * x0r + wk3i * x0i
+                          a(j3 + 1, i) = wk3r * x0i - wk3i * x0r
+                      end do
+                  end do
+              end if
+              l = m
+          end do
+          if (l .lt. n) then
+              do j = 0, l - 2, 2
+                  j1 = j + l
+                  x0r = a(j, i) - a(j1, i)
+                  x0i = a(j + 1, i) - a(j1 + 1, i)
+                  a(j, i) = a(j, i) + a(j1, i)
+                  a(j + 1, i) = a(j + 1, i) + a(j1 + 1, i)
+                  a(j1, i) = x0r
+                  a(j1 + 1, i) = x0i
+              end do
+          end if
+      end do
+      end
+!
+      subroutine cftfcol(n1max, n1, n, a, w)
+      integer n1max, n1, n, i, j, j1, j2, j3, k, k1, ks, l, m
+      real*8 a(0 : n1max - 1, 0 : n - 1), w(0 : *)
+      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      l = 1
+      do while (2 * l .lt. n)
+          m = 4 * l
+          do j = 0, l - 1
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              do i = 0, n1 - 2, 2
+                  x0r = a(i, j) + a(i, j1)
+                  x0i = a(i + 1, j) + a(i + 1, j1)
+                  x1r = a(i, j) - a(i, j1)
+                  x1i = a(i + 1, j) - a(i + 1, j1)
+                  x2r = a(i, j2) + a(i, j3)
+                  x2i = a(i + 1, j2) + a(i + 1, j3)
+                  x3r = a(i, j2) - a(i, j3)
+                  x3i = a(i + 1, j2) - a(i + 1, j3)
+                  a(i, j) = x0r + x2r
+                  a(i + 1, j) = x0i + x2i
+                  a(i, j2) = x0r - x2r
+                  a(i + 1, j2) = x0i - x2i
+                  a(i, j1) = x1r + x3i
+                  a(i + 1, j1) = x1i - x3r
+                  a(i, j3) = x1r - x3i
+                  a(i + 1, j3) = x1i + x3r
+              end do
+          end do
+          if (m .lt. n) then
+              wk1r = w(2)
+              do j = m, l + m - 1
+                  j1 = j + l
+                  j2 = j1 + l
+                  j3 = j2 + l
+                  do i = 0, n1 - 2, 2
+                      x0r = a(i, j) + a(i, j1)
+                      x0i = a(i + 1, j) + a(i + 1, j1)
+                      x1r = a(i, j) - a(i, j1)
+                      x1i = a(i + 1, j) - a(i + 1, j1)
+                      x2r = a(i, j2) + a(i, j3)
+                      x2i = a(i + 1, j2) + a(i + 1, j3)
+                      x3r = a(i, j2) - a(i, j3)
+                      x3i = a(i + 1, j2) - a(i + 1, j3)
+                      a(i, j) = x0r + x2r
+                      a(i + 1, j) = x0i + x2i
+                      a(i, j2) = x0i - x2i
+                      a(i + 1, j2) = x2r - x0r
+                      x0r = x1r + x3i
+                      x0i = x1i - x3r
+                      a(i, j1) = wk1r * (x0i + x0r)
+                      a(i + 1, j1) = wk1r * (x0i - x0r)
+                      x0r = x3i - x1r
+                      x0i = x3r + x1i
+                      a(i, j3) = wk1r * (x0r + x0i)
+                      a(i + 1, j3) = wk1r * (x0r - x0i)
+                  end do
+              end do
+              k1 = 1
+              ks = -1
+              do k = 2 * m, n - m, m
+                  k1 = k1 + 1
+                  ks = -ks
+                  wk1r = w(2 * k1)
+                  wk1i = w(2 * k1 + 1)
+                  wk2r = ks * w(k1)
+                  wk2i = w(k1 + ks)
+                  wk3r = wk1r - 2 * wk2i * wk1i
+                  wk3i = 2 * wk2i * wk1r - wk1i
+                  do j = k, l + k - 1
+                      j1 = j + l
+                      j2 = j1 + l
+                      j3 = j2 + l
+                      do i = 0, n1 - 2, 2
+                          x0r = a(i, j) + a(i, j1)
+                          x0i = a(i + 1, j) + a(i + 1, j1)
+                          x1r = a(i, j) - a(i, j1)
+                          x1i = a(i + 1, j) - a(i + 1, j1)
+                          x2r = a(i, j2) + a(i, j3)
+                          x2i = a(i + 1, j2) + a(i + 1, j3)
+                          x3r = a(i, j2) - a(i, j3)
+                          x3i = a(i + 1, j2) - a(i + 1, j3)
+                          a(i, j) = x0r + x2r
+                          a(i + 1, j) = x0i + x2i
+                          x0r = x0r - x2r
+                          x0i = x0i - x2i
+                          a(i, j2) = wk2r * x0r + wk2i * x0i
+                          a(i + 1, j2) = wk2r * x0i - wk2i * x0r
+                          x0r = x1r + x3i
+                          x0i = x1i - x3r
+                          a(i, j1) = wk1r * x0r + wk1i * x0i
+                          a(i + 1, j1) = wk1r * x0i - wk1i * x0r
+                          x0r = x1r - x3i
+                          x0i = x1i + x3r
+                          a(i, j3) = wk3r * x0r + wk3i * x0i
+                          a(i + 1, j3) = wk3r * x0i - wk3i * x0r
+                      end do
+                  end do
+              end do
+          end if
+          l = m
+      end do
+      if (l .lt. n) then
+          do j = 0, l - 1
+              j1 = j + l
+              do i = 0, n1 - 2, 2
+                  x0r = a(i, j) - a(i, j1)
+                  x0i = a(i + 1, j) - a(i + 1, j1)
+                  a(i, j) = a(i, j) + a(i, j1)
+                  a(i + 1, j) = a(i + 1, j) + a(i + 1, j1)
+                  a(i, j1) = x0r
+                  a(i + 1, j1) = x0i
+              end do
+          end do
+      end if
+      end
+!
+      subroutine rftbrow(n1max, n, n2, a, nc, c)
+      integer n1max, n, n2, nc, i, j, k, kk, ks
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
+     &    wkr, wki, xr, xi, yr, yi
+      ks = 4 * nc / n
+      do i = 0, n2 - 1
+          kk = 0
+          do k = n / 2 - 2, 2, -2
+              j = n - k
+              kk = kk + ks
+              wkr = 0.5d0 - c(kk)
+              wki = c(nc - kk)
+              xr = a(k, i) - a(j, i)
+              xi = a(k + 1, i) + a(j + 1, i)
+              yr = wkr * xr - wki * xi
+              yi = wkr * xi + wki * xr
+              a(k, i) = a(k, i) - yr
+              a(k + 1, i) = a(k + 1, i) - yi
+              a(j, i) = a(j, i) + yr
+              a(j + 1, i) = a(j + 1, i) - yi
+          end do
+      end do
+      end
+!
+      subroutine rftfrow(n1max, n, n2, a, nc, c)
+      integer n1max, n, n2, nc, i, j, k, kk, ks
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
+     &    wkr, wki, xr, xi, yr, yi
+      ks = 4 * nc / n
+      do i = 0, n2 - 1
+          kk = 0
+          do k = n / 2 - 2, 2, -2
+              j = n - k
+              kk = kk + ks
+              wkr = 0.5d0 - c(kk)
+              wki = c(nc - kk)
+              xr = a(k, i) - a(j, i)
+              xi = a(k + 1, i) + a(j + 1, i)
+              yr = wkr * xr + wki * xi
+              yi = wkr * xi - wki * xr
+              a(k, i) = a(k, i) - yr
+              a(k + 1, i) = a(k + 1, i) - yi
+              a(j, i) = a(j, i) + yr
+              a(j + 1, i) = a(j + 1, i) - yi
+          end do
+      end do
+      end
+!
+      subroutine dctbsub(n1max, n1, n2, a, nc, c)
+      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
+     &    k1, k2
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
+     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+      ks1 = nc / n1
+      ks2 = nc / n2
+      n2h = n2 / 2
+      kk2 = ks2
+      do k2 = 1, n2h - 1
+          j2 = n2 - k2
+          w2r = 2 * c(kk2)
+          w2i = 2 * c(nc - kk2)
+          kk2 = kk2 + ks2
+          kk1 = ks1
+          do k1 = 2, n1 - 2, 2
+              x0r = w2r * c(kk1)
+              x0i = w2i * c(kk1)
+              x1r = w2r * c(nc - kk1)
+              x1i = w2i * c(nc - kk1)
+              wkr = x0r - x1i
+              wki = x0i + x1r
+              wji = x0r + x1i
+              wjr = x0i - x1r
+              kk1 = kk1 + ks1
+              x0r = wkr * a(k1, k2) - wki * a(k1 + 1, k2)
+              x0i = wkr * a(k1 + 1, k2) + wki * a(k1, k2)
+              x1r = wjr * a(k1, j2) - wji * a(k1 + 1, j2)
+              x1i = wjr * a(k1 + 1, j2) + wji * a(k1, j2)
+              a(k1, k2) = x0r + x1i
+              a(k1 + 1, k2) = x0i - x1r
+              a(k1, j2) = x1r + x0i
+              a(k1 + 1, j2) = x1i - x0r
+          end do
+          wkr = w2r * 0.5d0
+          wki = w2i * 0.5d0
+          wjr = w2r * c(kk1)
+          wji = w2i * c(kk1)
+          x0r = a(0, k2) + a(0, j2)
+          x0i = a(1, k2) - a(1, j2)
+          x1r = a(0, k2) - a(0, j2)
+          x1i = a(1, k2) + a(1, j2)
+          a(0, k2) = wkr * x0r - wki * x0i
+          a(1, k2) = wkr * x0i + wki * x0r
+          a(0, j2) = -wjr * x1r + wji * x1i
+          a(1, j2) = wjr * x1i + wji * x1r
+      end do
+      w2r = 2 * c(kk2)
+      kk1 = ks1
+      do k1 = 2, n1 - 2, 2
+          wkr = 2 * c(kk1)
+          wki = 2 * c(nc - kk1)
+          wjr = w2r * wkr
+          wji = w2r * wki
+          kk1 = kk1 + ks1
+          x0i = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
+          a(k1, 0) = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
+          a(k1 + 1, 0) = x0i
+          x0i = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
+          a(k1, n2h) = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
+          a(k1 + 1, n2h) = x0i
+      end do
+      a(1, 0) = a(1, 0) * w2r
+      a(0, n2h) = a(0, n2h) * w2r
+      a(1, n2h) = a(1, n2h) * 0.5d0
+      end
+!
+      subroutine dctfsub(n1max, n1, n2, a, nc, c)
+      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
+     &    k1, k2
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
+     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+      ks1 = nc / n1
+      ks2 = nc / n2
+      n2h = n2 / 2
+      kk2 = ks2
+      do k2 = 1, n2h - 1
+          j2 = n2 - k2
+          w2r = 2 * c(kk2)
+          w2i = 2 * c(nc - kk2)
+          kk2 = kk2 + ks2
+          kk1 = ks1
+          do k1 = 2, n1 - 2, 2
+              x0r = w2r * c(kk1)
+              x0i = w2i * c(kk1)
+              x1r = w2r * c(nc - kk1)
+              x1i = w2i * c(nc - kk1)
+              wkr = x0r - x1i
+              wki = x0i + x1r
+              wji = x0r + x1i
+              wjr = x0i - x1r
+              kk1 = kk1 + ks1
+              x0r = a(k1, k2) - a(k1 + 1, j2)
+              x0i = a(k1, j2) + a(k1 + 1, k2)
+              x1r = a(k1, j2) - a(k1 + 1, k2)
+              x1i = a(k1, k2) + a(k1 + 1, j2)
+              a(k1, k2) = wkr * x0r + wki * x0i
+              a(k1 + 1, k2) = wkr * x0i - wki * x0r
+              a(k1, j2) = wjr * x1r + wji * x1i
+              a(k1 + 1, j2) = wjr * x1i - wji * x1r
+          end do
+          x0r = 2 * c(kk1)
+          wjr = x0r * w2r
+          wji = x0r * w2i
+          x0r = w2r * a(0, k2) + w2i * a(1, k2)
+          x0i = w2r * a(1, k2) - w2i * a(0, k2)
+          x1r = -wjr * a(0, j2) + wji * a(1, j2)
+          x1i = wjr * a(1, j2) + wji * a(0, j2)
+          a(0, k2) = x0r + x1r
+          a(1, k2) = x1i + x0i
+          a(0, j2) = x0r - x1r
+          a(1, j2) = x1i - x0i
+      end do
+      w2r = 2 * c(kk2)
+      kk1 = ks1
+      do k1 = 2, n1 - 2, 2
+          wkr = 2 * c(kk1)
+          wki = 2 * c(nc - kk1)
+          wjr = w2r * wkr
+          wji = w2r * wki
+          kk1 = kk1 + ks1
+          x0i = wkr * a(k1 + 1, 0) - wki * a(k1, 0)
+          a(k1, 0) = wkr * a(k1, 0) + wki * a(k1 + 1, 0)
+          a(k1 + 1, 0) = x0i
+          x0i = wjr * a(k1 + 1, n2h) - wji * a(k1, n2h)
+          a(k1, n2h) = wjr * a(k1, n2h) + wji * a(k1 + 1, n2h)
+          a(k1 + 1, n2h) = x0i
+      end do
+      w2r = w2r * 2
+      a(0, 0) = a(0, 0) * 2
+      a(1, 0) = a(1, 0) * w2r
+      a(0, n2h) = a(0, n2h) * w2r
+      end
+!
+      subroutine dstbsub(n1max, n1, n2, a, nc, c)
+      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
+     &    k1, k2
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
+     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+      ks1 = nc / n1
+      ks2 = nc / n2
+      n2h = n2 / 2
+      kk2 = ks2
+      do k2 = 1, n2h - 1
+          j2 = n2 - k2
+          w2r = 2 * c(kk2)
+          w2i = 2 * c(nc - kk2)
+          kk2 = kk2 + ks2
+          kk1 = ks1
+          do k1 = 2, n1 - 2, 2
+              x0r = w2r * c(kk1)
+              x0i = w2i * c(kk1)
+              x1r = w2r * c(nc - kk1)
+              x1i = w2i * c(nc - kk1)
+              wkr = x0r - x1i
+              wki = x0i + x1r
+              wji = x0r + x1i
+              wjr = x0i - x1r
+              kk1 = kk1 + ks1
+              x0r = wkr * a(k1, k2) - wki * a(k1 + 1, k2)
+              x0i = wkr * a(k1 + 1, k2) + wki * a(k1, k2)
+              x1r = wjr * a(k1, j2) - wji * a(k1 + 1, j2)
+              x1i = wjr * a(k1 + 1, j2) + wji * a(k1, j2)
+              a(k1, k2) = x1i - x0r
+              a(k1 + 1, k2) = x1r + x0i
+              a(k1, j2) = x0i - x1r
+              a(k1 + 1, j2) = x0r + x1i
+          end do
+          wkr = w2r * 0.5d0
+          wki = w2i * 0.5d0
+          wjr = w2r * c(kk1)
+          wji = w2i * c(kk1)
+          x0r = a(0, k2) + a(0, j2)
+          x0i = a(1, k2) - a(1, j2)
+          x1r = a(0, k2) - a(0, j2)
+          x1i = a(1, k2) + a(1, j2)
+          a(1, k2) = wkr * x0r - wki * x0i
+          a(0, k2) = wkr * x0i + wki * x0r
+          a(1, j2) = -wjr * x1r + wji * x1i
+          a(0, j2) = wjr * x1i + wji * x1r
+      end do
+      w2r = 2 * c(kk2)
+      kk1 = ks1
+      do k1 = 2, n1 - 2, 2
+          wkr = 2 * c(kk1)
+          wki = 2 * c(nc - kk1)
+          wjr = w2r * wkr
+          wji = w2r * wki
+          kk1 = kk1 + ks1
+          x0i = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
+          a(k1 + 1, 0) = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
+          a(k1, 0) = x0i
+          x0i = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
+          a(k1 + 1, n2h) = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
+          a(k1, n2h) = x0i
+      end do
+      a(1, 0) = a(1, 0) * w2r
+      a(0, n2h) = a(0, n2h) * w2r
+      a(1, n2h) = a(1, n2h) * 0.5d0
+      end
+!
+      subroutine dstfsub(n1max, n1, n2, a, nc, c)
+      integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2, 
+     &    k1, k2
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1), 
+     &    w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+      ks1 = nc / n1
+      ks2 = nc / n2
+      n2h = n2 / 2
+      kk2 = ks2
+      do k2 = 1, n2h - 1
+          j2 = n2 - k2
+          w2r = 2 * c(kk2)
+          w2i = 2 * c(nc - kk2)
+          kk2 = kk2 + ks2
+          kk1 = ks1
+          do k1 = 2, n1 - 2, 2
+              x0r = w2r * c(kk1)
+              x0i = w2i * c(kk1)
+              x1r = w2r * c(nc - kk1)
+              x1i = w2i * c(nc - kk1)
+              wkr = x0r - x1i
+              wki = x0i + x1r
+              wji = x0r + x1i
+              wjr = x0i - x1r
+              kk1 = kk1 + ks1
+              x0r = a(k1 + 1, j2) - a(k1, k2)
+              x0i = a(k1 + 1, k2) + a(k1, j2)
+              x1r = a(k1 + 1, k2) - a(k1, j2)
+              x1i = a(k1 + 1, j2) + a(k1, k2)
+              a(k1, k2) = wkr * x0r + wki * x0i
+              a(k1 + 1, k2) = wkr * x0i - wki * x0r
+              a(k1, j2) = wjr * x1r + wji * x1i
+              a(k1 + 1, j2) = wjr * x1i - wji * x1r
+          end do
+          x0r = 2 * c(kk1)
+          wjr = x0r * w2r
+          wji = x0r * w2i
+          x0r = w2r * a(1, k2) + w2i * a(0, k2)
+          x0i = w2r * a(0, k2) - w2i * a(1, k2)
+          x1r = -wjr * a(1, j2) + wji * a(0, j2)
+          x1i = wjr * a(0, j2) + wji * a(1, j2)
+          a(0, k2) = x0r + x1r
+          a(1, k2) = x1i + x0i
+          a(0, j2) = x0r - x1r
+          a(1, j2) = x1i - x0i
+      end do
+      w2r = 2 * c(kk2)
+      kk1 = ks1
+      do k1 = 2, n1 - 2, 2
+          wkr = 2 * c(kk1)
+          wki = 2 * c(nc - kk1)
+          wjr = w2r * wkr
+          wji = w2r * wki
+          kk1 = kk1 + ks1
+          x0i = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
+          a(k1, 0) = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
+          a(k1 + 1, 0) = x0i
+          x0i = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
+          a(k1, n2h) = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
+          a(k1 + 1, n2h) = x0i
+      end do
+      w2r = w2r * 2
+      a(0, 0) = a(0, 0) * 2
+      a(1, 0) = a(1, 0) * w2r
+      a(0, n2h) = a(0, n2h) * w2r
+      end
+!
diff --git a/fft4g.c b/fft4g.c
new file mode 100644
index 0000000..192dfd5
--- /dev/null
+++ b/fft4g.c
@@ -0,0 +1,1345 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :4, 2
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]), 
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]), 
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            cdft(2*n, -1, a, ip, w);
+        is 
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            rdft(n, 1, a, ip, w);
+        is 
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddct(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddst(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            dfst(n, a, t, ip, w);
+        is 
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when the larger table required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+void cdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void bitrv2(int n, int *ip, double *a);
+    void bitrv2conj(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    
+    if (n > (ip[0] << 2)) {
+        makewt(n >> 2, ip, w);
+    }
+    if (n > 4) {
+        if (isgn >= 0) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+        } else {
+            bitrv2conj(n, ip + 2, a);
+            cftbsub(n, a, w);
+        }
+    } else if (n == 4) {
+        cftfsub(n, a, w);
+    }
+}
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    int nw, nc;
+    double xi;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 2)) {
+        nc = n >> 2;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+}
+
+
+void ddct(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dctsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dstsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a, double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    m = n >> 1;
+    yi = a[m];
+    xi = a[0] + a[n];
+    a[0] -= a[n];
+    t[0] = xi - yi;
+    t[m] = xi + yi;
+    if (n > 2) {
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[j] - a[n - j];
+            xi = a[j] + a[n - j];
+            yr = a[k] - a[n - k];
+            yi = a[k] + a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi - yi;
+            t[k] = xi + yi;
+        }
+        t[mh] = a[mh] + a[n - mh];
+        a[mh] -= a[n - mh];
+        dctsub(m, a, nc, w + nw);
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[0] - a[1];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] + a[j + 1];
+            a[2 * j - 1] = a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {
+            dctsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[0] - t[1];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = t[j] - t[j + 1];
+                a[k + l] = t[j] + t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 0; j < mh; j++) {
+                k = m - j;
+                t[j] = t[m + k] - t[m + j];
+                t[k] = t[m + k] + t[m + j];
+            }
+            t[mh] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+        a[n] = t[2] - t[1];
+        a[0] = t[2] + t[1];
+    } else {
+        a[1] = a[0];
+        a[2] = t[0];
+        a[0] = t[1];
+    }
+}
+
+
+void dfst(int n, double *a, double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    if (n > 2) {
+        m = n >> 1;
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[j] + a[n - j];
+            xi = a[j] - a[n - j];
+            yr = a[k] + a[n - k];
+            yi = a[k] - a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi + yi;
+            t[k] = xi - yi;
+        }
+        t[0] = a[mh] - a[n - mh];
+        a[mh] += a[n - mh];
+        a[0] = a[m];
+        dstsub(m, a, nc, w + nw);
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[1] - a[0];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] - a[j + 1];
+            a[2 * j - 1] = -a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {
+            dstsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[1] - t[0];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = -t[j] - t[j + 1];
+                a[k + l] = t[j] - t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 1; j < mh; j++) {
+                k = m - j;
+                t[j] = t[m + k] + t[m + j];
+                t[k] = t[m + k] - t[m + j];
+            }
+            t[0] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+    }
+    a[0] = 0;
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w)
+{
+    void bitrv2(int n, int *ip, double *a);
+    int j, nwh;
+    double delta, x, y;
+    
+    ip[0] = nw;
+    ip[1] = 1;
+    if (nw > 2) {
+        nwh = nw >> 1;
+        delta = atan(1.0) / nwh;
+        w[0] = 1;
+        w[1] = 0;
+        w[nwh] = cos(delta * nwh);
+        w[nwh + 1] = w[nwh];
+        if (nwh > 2) {
+            for (j = 2; j < nwh; j += 2) {
+                x = cos(delta * j);
+                y = sin(delta * j);
+                w[j] = x;
+                w[j + 1] = y;
+                w[nw - j] = y;
+                w[nw - j + 1] = x;
+            }
+            bitrv2(nw, ip + 2, w);
+        }
+    }
+}
+
+
+void makect(int nc, int *ip, double *c)
+{
+    int j, nch;
+    double delta;
+    
+    ip[1] = nc;
+    if (nc > 1) {
+        nch = nc >> 1;
+        delta = atan(1.0) / nch;
+        c[0] = cos(delta * nch);
+        c[nch] = 0.5 * c[0];
+        for (j = 1; j < nch; j++) {
+            c[j] = 0.5 * cos(delta * j);
+            c[nc - j] = 0.5 * sin(delta * j);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+void bitrv2(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, m2;
+    double xr, xi, yr, yi;
+    
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            j1 = 2 * k + m2 + ip[k];
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    } else {
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+        }
+    }
+}
+
+
+void bitrv2conj(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, m2;
+    double xr, xi, yr, yi;
+    
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];
+            a[k1 + 1] = -a[k1 + 1];
+            j1 = k1 + m2;
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            k1 += m2;
+            a[k1 + 1] = -a[k1 + 1];
+        }
+    } else {
+        a[1] = -a[1];
+        a[m2 + 1] = -a[m2 + 1];
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];
+            a[k1 + 1] = -a[k1 + 1];
+            a[k1 + m2 + 1] = -a[k1 + m2 + 1];
+        }
+    }
+}
+
+
+void cftfsub(int n, double *a, double *w)
+{
+    void cft1st(int n, double *a, double *w);
+    void cftmdl(int n, int l, double *a, double *w);
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 2;
+    if (n > 8) {
+        cft1st(n, a, w);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a, w);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i - x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i + x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i - x3r;
+        }
+    } else {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = a[j + 1] - a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] += a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cftbsub(int n, double *a, double *w)
+{
+    void cft1st(int n, double *a, double *w);
+    void cftmdl(int n, int l, double *a, double *w);
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 2;
+    if (n > 8) {
+        cft1st(n, a, w);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a, w);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i - x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i + x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i - x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i + x3r;
+        }
+    } else {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = -a[j + 1] + a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] = -a[j + 1] - a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cft1st(int n, double *a, double *w)
+{
+    int j, k1, k2;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[2];
+    x0i = a[1] + a[3];
+    x1r = a[0] - a[2];
+    x1i = a[1] - a[3];
+    x2r = a[4] + a[6];
+    x2i = a[5] + a[7];
+    x3r = a[4] - a[6];
+    x3i = a[5] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[2] = x1r - x3i;
+    a[3] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+    wk1r = w[2];
+    x0r = a[8] + a[10];
+    x0i = a[9] + a[11];
+    x1r = a[8] - a[10];
+    x1i = a[9] - a[11];
+    x2r = a[12] + a[14];
+    x2i = a[13] + a[15];
+    x3r = a[12] - a[14];
+    x3i = a[13] - a[15];
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[12] = x2i - x0i;
+    a[13] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[10] = wk1r * (x0r - x0i);
+    a[11] = wk1r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[14] = wk1r * (x0i - x0r);
+    a[15] = wk1r * (x0i + x0r);
+    k1 = 0;
+    for (j = 16; j < n; j += 16) {
+        k1 += 2;
+        k2 = 2 * k1;
+        wk2r = w[k1];
+        wk2i = w[k1 + 1];
+        wk1r = w[k2];
+        wk1i = w[k2 + 1];
+        wk3r = wk1r - 2 * wk2i * wk1i;
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        x0r = a[j] + a[j + 2];
+        x0i = a[j + 1] + a[j + 3];
+        x1r = a[j] - a[j + 2];
+        x1i = a[j + 1] - a[j + 3];
+        x2r = a[j + 4] + a[j + 6];
+        x2i = a[j + 5] + a[j + 7];
+        x3r = a[j + 4] - a[j + 6];
+        x3i = a[j + 5] - a[j + 7];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 4] = wk2r * x0r - wk2i * x0i;
+        a[j + 5] = wk2r * x0i + wk2i * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 2] = wk1r * x0r - wk1i * x0i;
+        a[j + 3] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 6] = wk3r * x0r - wk3i * x0i;
+        a[j + 7] = wk3r * x0i + wk3i * x0r;
+        wk1r = w[k2 + 2];
+        wk1i = w[k2 + 3];
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        x0r = a[j + 8] + a[j + 10];
+        x0i = a[j + 9] + a[j + 11];
+        x1r = a[j + 8] - a[j + 10];
+        x1i = a[j + 9] - a[j + 11];
+        x2r = a[j + 12] + a[j + 14];
+        x2i = a[j + 13] + a[j + 15];
+        x3r = a[j + 12] - a[j + 14];
+        x3i = a[j + 13] - a[j + 15];
+        a[j + 8] = x0r + x2r;
+        a[j + 9] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 12] = -wk2i * x0r - wk2r * x0i;
+        a[j + 13] = -wk2i * x0i + wk2r * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 10] = wk1r * x0r - wk1i * x0i;
+        a[j + 11] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 14] = wk3r * x0r - wk3i * x0i;
+        a[j + 15] = wk3r * x0i + wk3i * x0r;
+    }
+}
+
+
+void cftmdl(int n, int l, double *a, double *w)
+{
+    int j, j1, j2, j3, k, k1, k2, m, m2;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    m = l << 2;
+    for (j = 0; j < l; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x0r - x2r;
+        a[j2 + 1] = x0i - x2i;
+        a[j1] = x1r - x3i;
+        a[j1 + 1] = x1i + x3r;
+        a[j3] = x1r + x3i;
+        a[j3 + 1] = x1i - x3r;
+    }
+    wk1r = w[2];
+    for (j = m; j < l + m; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x2i - x0i;
+        a[j2 + 1] = x0r - x2r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j1] = wk1r * (x0r - x0i);
+        a[j1 + 1] = wk1r * (x0r + x0i);
+        x0r = x3i + x1r;
+        x0i = x3r - x1i;
+        a[j3] = wk1r * (x0i - x0r);
+        a[j3 + 1] = wk1r * (x0i + x0r);
+    }
+    k1 = 0;
+    m2 = 2 * m;
+    for (k = m2; k < n; k += m2) {
+        k1 += 2;
+        k2 = 2 * k1;
+        wk2r = w[k1];
+        wk2i = w[k1 + 1];
+        wk1r = w[k2];
+        wk1i = w[k2 + 1];
+        wk3r = wk1r - 2 * wk2i * wk1i;
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        for (j = k; j < l + k; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = wk2r * x0r - wk2i * x0i;
+            a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+        wk1r = w[k2 + 2];
+        wk1i = w[k2 + 3];
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        for (j = k + m; j < l + (k + m); j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = -wk2i * x0r - wk2r * x0i;
+            a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+    }
+}
+
+
+void rftfsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr - wki * xi;
+        yi = wkr * xi + wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+
+void rftbsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    a[1] = -a[1];
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr + wki * xi;
+        yi = wkr * xi - wki * xr;
+        a[j] -= yr;
+        a[j + 1] = yi - a[j + 1];
+        a[k] += yr;
+        a[k + 1] = yi - a[k + 1];
+    }
+    a[m + 1] = -a[m + 1];
+}
+
+
+void dctsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n;
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        kk += ks;
+        wkr = c[kk] - c[nc - kk];
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[j] - wkr * a[k];
+        a[j] = wkr * a[j] + wki * a[k];
+        a[k] = xr;
+    }
+    a[m] *= c[0];
+}
+
+
+void dstsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n;
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        kk += ks;
+        wkr = c[kk] - c[nc - kk];
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[k] - wkr * a[j];
+        a[k] = wkr * a[k] + wki * a[j];
+        a[j] = xr;
+    }
+    a[m] *= c[0];
+}
+
diff --git a/fft4g.f b/fft4g.f
new file mode 100644
index 0000000..6c52ff3
--- /dev/null
+++ b/fft4g.f
@@ -0,0 +1,1240 @@
+! Fast Fourier/Cosine/Sine Transform
+!     dimension   :one
+!     data length :power of 2
+!     decimation  :frequency
+!     radix       :4, 2
+!     data        :inplace
+!     table       :use
+! subroutines
+!     cdft: Complex Discrete Fourier Transform
+!     rdft: Real Discrete Fourier Transform
+!     ddct: Discrete Cosine Transform
+!     ddst: Discrete Sine Transform
+!     dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+!     dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+!     [definition]
+!         <case1>
+!             X(k) = sum_j=0^n-1 x(j)*exp(2*pi*i*j*k/n), 0<=k<n
+!         <case2>
+!             X(k) = sum_j=0^n-1 x(j)*exp(-2*pi*i*j*k/n), 0<=k<n
+!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call cdft(2*n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call cdft(2*n, -1, a, ip, w)
+!     [parameters]
+!         2*n          :data length (integer)
+!                       n >= 1, n = power of 2
+!         a(0:2*n-1)   :input/output data (real*8)
+!                       input data
+!                           a(2*j) = Re(x(j)), 
+!                           a(2*j+1) = Im(x(j)), 0<=j<n
+!                       output data
+!                           a(2*k) = Re(X(k)), 
+!                           a(2*k+1) = Im(X(k)), 0<=k<n
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n/2-1)   :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call cdft(2*n, -1, a, ip, w)
+!         is 
+!             call cdft(2*n, 1, a, ip, w)
+!             do j = 0, 2 * n - 1
+!                 a(j) = a(j) / n
+!             end do
+!         .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+!     [definition]
+!         <case1> RDFT
+!             R(k) = sum_j=0^n-1 a(j)*cos(2*pi*j*k/n), 0<=k<=n/2
+!             I(k) = sum_j=0^n-1 a(j)*sin(2*pi*j*k/n), 0<k<n/2
+!         <case2> IRDFT (excluding scale)
+!             a(k) = (R(0) + R(n/2)*cos(pi*k))/2 + 
+!                    sum_j=1^n/2-1 R(j)*cos(2*pi*j*k/n) + 
+!                    sum_j=1^n/2-1 I(j)*sin(2*pi*j*k/n), 0<=k<n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call rdft(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call rdft(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       <case1>
+!                           output data
+!                               a(2*k) = R(k), 0<=k<n/2
+!                               a(2*k+1) = I(k), 0<k<n/2
+!                               a(1) = R(n/2)
+!                       <case2>
+!                           input data
+!                               a(2*j) = R(j), 0<=j<n/2
+!                               a(2*j+1) = I(j), 0<j<n/2
+!                               a(1) = R(n/2)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n/2-1)   :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call rdft(n, 1, a, ip, w)
+!         is 
+!             call rdft(n, -1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+!     [definition]
+!         <case1> IDCT (excluding scale)
+!             C(k) = sum_j=0^n-1 a(j)*cos(pi*j*(k+1/2)/n), 0<=k<n
+!         <case2> DCT
+!             C(k) = sum_j=0^n-1 a(j)*cos(pi*(j+1/2)*k/n), 0<=k<n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddct(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddct(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       output data
+!                           a(k) = C(k), 0<=k<n
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/4-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddct(n, -1, a, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             call ddct(n, 1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+!     [definition]
+!         <case1> IDST (excluding scale)
+!             S(k) = sum_j=1^n A(j)*sin(pi*j*(k+1/2)/n), 0<=k<n
+!         <case2> DST
+!             S(k) = sum_j=0^n-1 a(j)*sin(pi*(j+1/2)*k/n), 0<k<=n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddst(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddst(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       <case1>
+!                           input data
+!                               a(j) = A(j), 0<j<n
+!                               a(0) = A(n)
+!                           output data
+!                               a(k) = S(k), 0<=k<n
+!                       <case2>
+!                           output data
+!                               a(k) = S(k), 0<k<n
+!                               a(0) = S(n)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/4-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddst(n, -1, a, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             call ddst(n, 1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+!     [definition]
+!         C(k) = sum_j=0^n a(j)*cos(pi*j*k/n), 0<=k<=n
+!     [usage]
+!         ip(0) = 0  ! first time only
+!         call dfct(n, a, t, ip, w)
+!     [parameters]
+!         n            :data length - 1 (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n)       :input/output data (real*8)
+!                       output data
+!                           a(k) = C(k), 0<=k<=n
+!         t(0:n/2)     :work area (real*8)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/4)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/8-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             a(0) = a(0) / 2
+!             a(n) = a(n) / 2
+!             call dfct(n, a, t, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             a(n) = a(n) / 2
+!             call dfct(n, a, t, ip, w)
+!             do j = 0, n
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+!     [definition]
+!         S(k) = sum_j=1^n-1 a(j)*sin(pi*j*k/n), 0<k<n
+!     [usage]
+!         ip(0) = 0  ! first time only
+!         call dfst(n, a, t, ip, w)
+!     [parameters]
+!         n            :data length + 1 (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       output data
+!                           a(k) = S(k), 0<k<n
+!                       (a(0) is used for work area)
+!         t(0:n/2-1)   :work area (real*8)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/4)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/8-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call dfst(n, a, t, ip, w)
+!         is 
+!             call dfst(n, a, t, ip, w)
+!             do j = 1, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! Appendix :
+!     The cos/sin table is recalculated when the larger table required.
+!     w() and ip() are compatible with all routines.
+!
+!
+      subroutine cdft(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *)
+      real*8 a(0 : n - 1), w(0 : *)
+      if (n .gt. 4 * ip(0)) then
+          call makewt(n / 4, ip, w)
+      end if
+      if (n .gt. 4) then
+          if (isgn .ge. 0) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+          else
+              call bitrv2conj(n, ip(2), a)
+              call cftbsub(n, a, w)
+          end if
+      else if (n .eq. 4) then
+          call cftfsub(n, a, w)
+      end if
+      end
+!
+      subroutine rdft(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xi
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 4 * nc) then
+          nc = n / 4
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+          xi = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          a(1) = xi
+      else
+          a(1) = 0.5d0 * (a(0) - a(1))
+          a(0) = a(0) - a(1)
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call bitrv2(n, ip(2), a)
+              call cftbsub(n, a, w)
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+      end if
+      end
+!
+      subroutine ddct(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), j, nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xr
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          xr = a(n - 1)
+          do j = n - 2, 2, -2
+              a(j + 1) = a(j) - a(j - 1)
+              a(j) = a(j) + a(j - 1)
+          end do
+          a(1) = a(0) - xr
+          a(0) = a(0) + xr
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call bitrv2(n, ip(2), a)
+              call cftbsub(n, a, w)
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+      end if
+      call dctsub(n, a, nc, w(nw))
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+          xr = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          do j = 2, n - 2, 2
+              a(j - 1) = a(j) - a(j + 1)
+              a(j) = a(j) + a(j + 1)
+          end do
+          a(n - 1) = xr
+      end if
+      end
+!
+      subroutine ddst(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), j, nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xr
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          xr = a(n - 1)
+          do j = n - 2, 2, -2
+              a(j + 1) = -a(j) - a(j - 1)
+              a(j) = a(j) - a(j - 1)
+          end do
+          a(1) = a(0) + xr
+          a(0) = a(0) - xr
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call bitrv2(n, ip(2), a)
+              call cftbsub(n, a, w)
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+      end if
+      call dstsub(n, a, nc, w(nw))
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+          xr = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          do j = 2, n - 2, 2
+              a(j - 1) = -a(j) - a(j + 1)
+              a(j) = a(j) - a(j + 1)
+          end do
+          a(n - 1) = -xr
+      end if
+      end
+!
+      subroutine dfct(n, a, t, ip, w)
+      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+      real*8 a(0 : n), t(0 : n / 2), w(0 : *), xr, xi, yr, yi
+      nw = ip(0)
+      if (n .gt. 8 * nw) then
+          nw = n / 8
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 2 * nc) then
+          nc = n / 2
+          call makect(nc, ip, w(nw))
+      end if
+      m = n / 2
+      yi = a(m)
+      xi = a(0) + a(n)
+      a(0) = a(0) - a(n)
+      t(0) = xi - yi
+      t(m) = xi + yi
+      if (n .gt. 2) then
+          mh = m / 2
+          do j = 1, mh - 1
+              k = m - j
+              xr = a(j) - a(n - j)
+              xi = a(j) + a(n - j)
+              yr = a(k) - a(n - k)
+              yi = a(k) + a(n - k)
+              a(j) = xr
+              a(k) = yr
+              t(j) = xi - yi
+              t(k) = xi + yi
+          end do
+          t(mh) = a(mh) + a(n - mh)
+          a(mh) = a(mh) - a(n - mh)
+          call dctsub(m, a, nc, w(nw))
+          if (m .gt. 4) then
+              call bitrv2(m, ip(2), a)
+              call cftfsub(m, a, w)
+              call rftfsub(m, a, nc, w(nw))
+          else if (m .eq. 4) then
+              call cftfsub(m, a, w)
+          end if
+          a(n - 1) = a(0) - a(1)
+          a(1) = a(0) + a(1)
+          do j = m - 2, 2, -2
+              a(2 * j + 1) = a(j) + a(j + 1)
+              a(2 * j - 1) = a(j) - a(j + 1)
+          end do
+          l = 2
+          m = mh
+          do while (m .ge. 2)
+              call dctsub(m, t, nc, w(nw))
+              if (m .gt. 4) then
+                  call bitrv2(m, ip(2), t)
+                  call cftfsub(m, t, w)
+                  call rftfsub(m, t, nc, w(nw))
+              else if (m .eq. 4) then
+                  call cftfsub(m, t, w)
+              end if
+              a(n - l) = t(0) - t(1)
+              a(l) = t(0) + t(1)
+              k = 0
+              do j = 2, m - 2, 2
+                  k = k + 4 * l
+                  a(k - l) = t(j) - t(j + 1)
+                  a(k + l) = t(j) + t(j + 1)
+              end do
+              l = 2 * l
+              mh = m / 2
+              do j = 0, mh - 1
+                  k = m - j
+                  t(j) = t(m + k) - t(m + j)
+                  t(k) = t(m + k) + t(m + j)
+              end do
+              t(mh) = t(m + mh)
+              m = mh
+          end do
+          a(l) = t(0)
+          a(n) = t(2) - t(1)
+          a(0) = t(2) + t(1)
+      else
+          a(1) = a(0)
+          a(2) = t(0)
+          a(0) = t(1)
+      end if
+      end
+!
+      subroutine dfst(n, a, t, ip, w)
+      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+      real*8 a(0 : n - 1), t(0 : n / 2 - 1), w(0 : *), xr, xi, yr, yi
+      nw = ip(0)
+      if (n .gt. 8 * nw) then
+          nw = n / 8
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 2 * nc) then
+          nc = n / 2
+          call makect(nc, ip, w(nw))
+      end if
+      if (n .gt. 2) then
+          m = n / 2
+          mh = m / 2
+          do j = 1, mh - 1
+              k = m - j
+              xr = a(j) + a(n - j)
+              xi = a(j) - a(n - j)
+              yr = a(k) + a(n - k)
+              yi = a(k) - a(n - k)
+              a(j) = xr
+              a(k) = yr
+              t(j) = xi + yi
+              t(k) = xi - yi
+          end do
+          t(0) = a(mh) - a(n - mh)
+          a(mh) = a(mh) + a(n - mh)
+          a(0) = a(m)
+          call dstsub(m, a, nc, w(nw))
+          if (m .gt. 4) then
+              call bitrv2(m, ip(2), a)
+              call cftfsub(m, a, w)
+              call rftfsub(m, a, nc, w(nw))
+          else if (m .eq. 4) then
+              call cftfsub(m, a, w)
+          end if
+          a(n - 1) = a(1) - a(0)
+          a(1) = a(0) + a(1)
+          do j = m - 2, 2, -2
+              a(2 * j + 1) = a(j) - a(j + 1)
+              a(2 * j - 1) = -a(j) - a(j + 1)
+          end do
+          l = 2
+          m = mh
+          do while (m .ge. 2)
+              call dstsub(m, t, nc, w(nw))
+              if (m .gt. 4) then
+                  call bitrv2(m, ip(2), t)
+                  call cftfsub(m, t, w)
+                  call rftfsub(m, t, nc, w(nw))
+              else if (m .eq. 4) then
+                  call cftfsub(m, t, w)
+              end if
+              a(n - l) = t(1) - t(0)
+              a(l) = t(0) + t(1)
+              k = 0
+              do j = 2, m - 2, 2
+                  k = k + 4 * l
+                  a(k - l) = -t(j) - t(j + 1)
+                  a(k + l) = t(j) - t(j + 1)
+              end do
+              l = 2 * l
+              mh = m / 2
+              do j = 1, mh - 1
+                  k = m - j
+                  t(j) = t(m + k) + t(m + j)
+                  t(k) = t(m + k) - t(m + j)
+              end do
+              t(0) = t(m + mh)
+              m = mh
+          end do
+          a(l) = t(0)
+      end if
+      a(0) = 0
+      end
+!
+! -------- initializing routines --------
+!
+      subroutine makewt(nw, ip, w)
+      integer nw, ip(0 : *), j, nwh
+      real*8 w(0 : nw - 1), delta, x, y
+      ip(0) = nw
+      ip(1) = 1
+      if (nw .gt. 2) then
+          nwh = nw / 2
+          delta = atan(1.0d0) / nwh
+          w(0) = 1
+          w(1) = 0
+          w(nwh) = cos(delta * nwh)
+          w(nwh + 1) = w(nwh)
+          if (nwh .gt. 2) then
+              do j = 2, nwh - 2, 2
+                  x = cos(delta * j)
+                  y = sin(delta * j)
+                  w(j) = x
+                  w(j + 1) = y
+                  w(nw - j) = y
+                  w(nw - j + 1) = x
+              end do
+              call bitrv2(nw, ip(2), w)
+          end if
+      end if
+      end
+!
+      subroutine makect(nc, ip, c)
+      integer nc, ip(0 : *), j, nch
+      real*8 c(0 : nc - 1), delta
+      ip(1) = nc
+      if (nc .gt. 1) then
+          nch = nc / 2
+          delta = atan(1.0d0) / nch
+          c(0) = cos(delta * nch)
+          c(nch) = 0.5d0 * c(0)
+          do j = 1, nch - 1
+              c(j) = 0.5d0 * cos(delta * j)
+              c(nc - j) = 0.5d0 * sin(delta * j)
+          end do
+      end if
+      end
+!
+! -------- child routines --------
+!
+      subroutine bitrv2(n, ip, a)
+      integer n, ip(0 : *), j, j1, k, k1, l, m, m2
+      real*8 a(0 : n - 1), xr, xi, yr, yi
+      ip(0) = 0
+      l = n
+      m = 1
+      do while (8 * m .lt. l)
+          l = l / 2
+          do j = 0, m - 1
+              ip(m + j) = ip(j) + l
+          end do
+          m = m * 2
+      end do
+      m2 = 2 * m
+      if (8 * m .eq. l) then
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 - m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              j1 = 2 * k + m2 + ip(k)
+              k1 = j1 + m2
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+          end do
+      else
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+          end do
+      end if
+      end
+!
+      subroutine bitrv2conj(n, ip, a)
+      integer n, ip(0 : *), j, j1, k, k1, l, m, m2
+      real*8 a(0 : n - 1), xr, xi, yr, yi
+      ip(0) = 0
+      l = n
+      m = 1
+      do while (8 * m .lt. l)
+          l = l / 2
+          do j = 0, m - 1
+              ip(m + j) = ip(j) + l
+          end do
+          m = m * 2
+      end do
+      m2 = 2 * m
+      if (8 * m .eq. l) then
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 - m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 2 * k + ip(k)
+              a(k1 + 1) = -a(k1 + 1)
+              j1 = k1 + m2
+              k1 = j1 + m2
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              k1 = k1 + m2
+              a(k1 + 1) = -a(k1 + 1)
+          end do
+      else
+          a(1) = -a(1)
+          a(m2 + 1) = -a(m2 + 1)
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 2 * k + ip(k)
+              a(k1 + 1) = -a(k1 + 1)
+              a(k1 + m2 + 1) = -a(k1 + m2 + 1)
+          end do
+      end if
+      end
+!
+      subroutine cftfsub(n, a, w)
+      integer n, j, j1, j2, j3, l
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      l = 2
+      if (n .gt. 8) then
+          call cft1st(n, a, w)
+          l = 8
+          do while (4 * l .lt. n)
+              call cftmdl(n, l, a, w)
+              l = 4 * l
+          end do
+      end if
+      if (4 * l .eq. n) then
+          do j = 0, l - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              x0r = a(j) + a(j1)
+              x0i = a(j + 1) + a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = a(j + 1) - a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              a(j) = x0r + x2r
+              a(j + 1) = x0i + x2i
+              a(j2) = x0r - x2r
+              a(j2 + 1) = x0i - x2i
+              a(j1) = x1r - x3i
+              a(j1 + 1) = x1i + x3r
+              a(j3) = x1r + x3i
+              a(j3 + 1) = x1i - x3r
+          end do
+      else
+          do j = 0, l - 2, 2
+              j1 = j + l
+              x0r = a(j) - a(j1)
+              x0i = a(j + 1) - a(j1 + 1)
+              a(j) = a(j) + a(j1)
+              a(j + 1) = a(j + 1) + a(j1 + 1)
+              a(j1) = x0r
+              a(j1 + 1) = x0i
+          end do
+      end if
+      end
+!
+      subroutine cftbsub(n, a, w)
+      integer n, j, j1, j2, j3, l
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      l = 2
+      if (n .gt. 8) then
+          call cft1st(n, a, w)
+          l = 8
+          do while (4 * l .lt. n)
+              call cftmdl(n, l, a, w)
+              l = 4 * l
+          end do
+      end if
+      if (4 * l .eq. n) then
+          do j = 0, l - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              x0r = a(j) + a(j1)
+              x0i = -a(j + 1) - a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = -a(j + 1) + a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              a(j) = x0r + x2r
+              a(j + 1) = x0i - x2i
+              a(j2) = x0r - x2r
+              a(j2 + 1) = x0i + x2i
+              a(j1) = x1r - x3i
+              a(j1 + 1) = x1i - x3r
+              a(j3) = x1r + x3i
+              a(j3 + 1) = x1i + x3r
+          end do
+      else
+          do j = 0, l - 2, 2
+              j1 = j + l
+              x0r = a(j) - a(j1)
+              x0i = -a(j + 1) + a(j1 + 1)
+              a(j) = a(j) + a(j1)
+              a(j + 1) = -a(j + 1) - a(j1 + 1)
+              a(j1) = x0r
+              a(j1 + 1) = x0i
+          end do
+      end if
+      end
+!
+      subroutine cft1st(n, a, w)
+      integer n, j, k1, k2
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      x0r = a(0) + a(2)
+      x0i = a(1) + a(3)
+      x1r = a(0) - a(2)
+      x1i = a(1) - a(3)
+      x2r = a(4) + a(6)
+      x2i = a(5) + a(7)
+      x3r = a(4) - a(6)
+      x3i = a(5) - a(7)
+      a(0) = x0r + x2r
+      a(1) = x0i + x2i
+      a(4) = x0r - x2r
+      a(5) = x0i - x2i
+      a(2) = x1r - x3i
+      a(3) = x1i + x3r
+      a(6) = x1r + x3i
+      a(7) = x1i - x3r
+      wk1r = w(2)
+      x0r = a(8) + a(10)
+      x0i = a(9) + a(11)
+      x1r = a(8) - a(10)
+      x1i = a(9) - a(11)
+      x2r = a(12) + a(14)
+      x2i = a(13) + a(15)
+      x3r = a(12) - a(14)
+      x3i = a(13) - a(15)
+      a(8) = x0r + x2r
+      a(9) = x0i + x2i
+      a(12) = x2i - x0i
+      a(13) = x0r - x2r
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      a(10) = wk1r * (x0r - x0i)
+      a(11) = wk1r * (x0r + x0i)
+      x0r = x3i + x1r
+      x0i = x3r - x1i
+      a(14) = wk1r * (x0i - x0r)
+      a(15) = wk1r * (x0i + x0r)
+      k1 = 0
+      do j = 16, n - 16, 16
+          k1 = k1 + 2
+          k2 = 2 * k1
+          wk2r = w(k1)
+          wk2i = w(k1 + 1)
+          wk1r = w(k2)
+          wk1i = w(k2 + 1)
+          wk3r = wk1r - 2 * wk2i * wk1i
+          wk3i = 2 * wk2i * wk1r - wk1i
+          x0r = a(j) + a(j + 2)
+          x0i = a(j + 1) + a(j + 3)
+          x1r = a(j) - a(j + 2)
+          x1i = a(j + 1) - a(j + 3)
+          x2r = a(j + 4) + a(j + 6)
+          x2i = a(j + 5) + a(j + 7)
+          x3r = a(j + 4) - a(j + 6)
+          x3i = a(j + 5) - a(j + 7)
+          a(j) = x0r + x2r
+          a(j + 1) = x0i + x2i
+          x0r = x0r - x2r
+          x0i = x0i - x2i
+          a(j + 4) = wk2r * x0r - wk2i * x0i
+          a(j + 5) = wk2r * x0i + wk2i * x0r
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          a(j + 2) = wk1r * x0r - wk1i * x0i
+          a(j + 3) = wk1r * x0i + wk1i * x0r
+          x0r = x1r + x3i
+          x0i = x1i - x3r
+          a(j + 6) = wk3r * x0r - wk3i * x0i
+          a(j + 7) = wk3r * x0i + wk3i * x0r
+          wk1r = w(k2 + 2)
+          wk1i = w(k2 + 3)
+          wk3r = wk1r - 2 * wk2r * wk1i
+          wk3i = 2 * wk2r * wk1r - wk1i
+          x0r = a(j + 8) + a(j + 10)
+          x0i = a(j + 9) + a(j + 11)
+          x1r = a(j + 8) - a(j + 10)
+          x1i = a(j + 9) - a(j + 11)
+          x2r = a(j + 12) + a(j + 14)
+          x2i = a(j + 13) + a(j + 15)
+          x3r = a(j + 12) - a(j + 14)
+          x3i = a(j + 13) - a(j + 15)
+          a(j + 8) = x0r + x2r
+          a(j + 9) = x0i + x2i
+          x0r = x0r - x2r
+          x0i = x0i - x2i
+          a(j + 12) = -wk2i * x0r - wk2r * x0i
+          a(j + 13) = -wk2i * x0i + wk2r * x0r
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          a(j + 10) = wk1r * x0r - wk1i * x0i
+          a(j + 11) = wk1r * x0i + wk1i * x0r
+          x0r = x1r + x3i
+          x0i = x1i - x3r
+          a(j + 14) = wk3r * x0r - wk3i * x0i
+          a(j + 15) = wk3r * x0i + wk3i * x0r
+      end do
+      end
+!
+      subroutine cftmdl(n, l, a, w)
+      integer n, l, j, j1, j2, j3, k, k1, k2, m, m2
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      m = 4 * l
+      do j = 0, l - 2, 2
+          j1 = j + l
+          j2 = j1 + l
+          j3 = j2 + l
+          x0r = a(j) + a(j1)
+          x0i = a(j + 1) + a(j1 + 1)
+          x1r = a(j) - a(j1)
+          x1i = a(j + 1) - a(j1 + 1)
+          x2r = a(j2) + a(j3)
+          x2i = a(j2 + 1) + a(j3 + 1)
+          x3r = a(j2) - a(j3)
+          x3i = a(j2 + 1) - a(j3 + 1)
+          a(j) = x0r + x2r
+          a(j + 1) = x0i + x2i
+          a(j2) = x0r - x2r
+          a(j2 + 1) = x0i - x2i
+          a(j1) = x1r - x3i
+          a(j1 + 1) = x1i + x3r
+          a(j3) = x1r + x3i
+          a(j3 + 1) = x1i - x3r
+      end do
+      wk1r = w(2)
+      do j = m, l + m - 2, 2
+          j1 = j + l
+          j2 = j1 + l
+          j3 = j2 + l
+          x0r = a(j) + a(j1)
+          x0i = a(j + 1) + a(j1 + 1)
+          x1r = a(j) - a(j1)
+          x1i = a(j + 1) - a(j1 + 1)
+          x2r = a(j2) + a(j3)
+          x2i = a(j2 + 1) + a(j3 + 1)
+          x3r = a(j2) - a(j3)
+          x3i = a(j2 + 1) - a(j3 + 1)
+          a(j) = x0r + x2r
+          a(j + 1) = x0i + x2i
+          a(j2) = x2i - x0i
+          a(j2 + 1) = x0r - x2r
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          a(j1) = wk1r * (x0r - x0i)
+          a(j1 + 1) = wk1r * (x0r + x0i)
+          x0r = x3i + x1r
+          x0i = x3r - x1i
+          a(j3) = wk1r * (x0i - x0r)
+          a(j3 + 1) = wk1r * (x0i + x0r)
+      end do
+      k1 = 0
+      m2 = 2 * m
+      do k = m2, n - m2, m2
+          k1 = k1 + 2
+          k2 = 2 * k1
+          wk2r = w(k1)
+          wk2i = w(k1 + 1)
+          wk1r = w(k2)
+          wk1i = w(k2 + 1)
+          wk3r = wk1r - 2 * wk2i * wk1i
+          wk3i = 2 * wk2i * wk1r - wk1i
+          do j = k, l + k - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              x0r = a(j) + a(j1)
+              x0i = a(j + 1) + a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = a(j + 1) - a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              a(j) = x0r + x2r
+              a(j + 1) = x0i + x2i
+              x0r = x0r - x2r
+              x0i = x0i - x2i
+              a(j2) = wk2r * x0r - wk2i * x0i
+              a(j2 + 1) = wk2r * x0i + wk2i * x0r
+              x0r = x1r - x3i
+              x0i = x1i + x3r
+              a(j1) = wk1r * x0r - wk1i * x0i
+              a(j1 + 1) = wk1r * x0i + wk1i * x0r
+              x0r = x1r + x3i
+              x0i = x1i - x3r
+              a(j3) = wk3r * x0r - wk3i * x0i
+              a(j3 + 1) = wk3r * x0i + wk3i * x0r
+          end do
+          wk1r = w(k2 + 2)
+          wk1i = w(k2 + 3)
+          wk3r = wk1r - 2 * wk2r * wk1i
+          wk3i = 2 * wk2r * wk1r - wk1i
+          do j = k + m, l + (k + m) - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              x0r = a(j) + a(j1)
+              x0i = a(j + 1) + a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = a(j + 1) - a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              a(j) = x0r + x2r
+              a(j + 1) = x0i + x2i
+              x0r = x0r - x2r
+              x0i = x0i - x2i
+              a(j2) = -wk2i * x0r - wk2r * x0i
+              a(j2 + 1) = -wk2i * x0i + wk2r * x0r
+              x0r = x1r - x3i
+              x0i = x1i + x3r
+              a(j1) = wk1r * x0r - wk1i * x0i
+              a(j1 + 1) = wk1r * x0i + wk1i * x0r
+              x0r = x1r + x3i
+              x0i = x1i - x3r
+              a(j3) = wk3r * x0r - wk3i * x0i
+              a(j3 + 1) = wk3r * x0i + wk3i * x0r
+          end do
+      end do
+      end
+!
+      subroutine rftfsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+      m = n / 2
+      ks = 2 * nc / m
+      kk = 0
+      do j = 2, m - 2, 2
+          k = n - j
+          kk = kk + ks
+          wkr = 0.5d0 - c(nc - kk)
+          wki = c(kk)
+          xr = a(j) - a(k)
+          xi = a(j + 1) + a(k + 1)
+          yr = wkr * xr - wki * xi
+          yi = wkr * xi + wki * xr
+          a(j) = a(j) - yr
+          a(j + 1) = a(j + 1) - yi
+          a(k) = a(k) + yr
+          a(k + 1) = a(k + 1) - yi
+      end do
+      end
+!
+      subroutine rftbsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+      a(1) = -a(1)
+      m = n / 2
+      ks = 2 * nc / m
+      kk = 0
+      do j = 2, m - 2, 2
+          k = n - j
+          kk = kk + ks
+          wkr = 0.5d0 - c(nc - kk)
+          wki = c(kk)
+          xr = a(j) - a(k)
+          xi = a(j + 1) + a(k + 1)
+          yr = wkr * xr + wki * xi
+          yi = wkr * xi - wki * xr
+          a(j) = a(j) - yr
+          a(j + 1) = yi - a(j + 1)
+          a(k) = a(k) + yr
+          a(k + 1) = yi - a(k + 1)
+      end do
+      a(m + 1) = -a(m + 1)
+      end
+!
+      subroutine dctsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+      m = n / 2
+      ks = nc / n
+      kk = 0
+      do j = 1, m - 1
+          k = n - j
+          kk = kk + ks
+          wkr = c(kk) - c(nc - kk)
+          wki = c(kk) + c(nc - kk)
+          xr = wki * a(j) - wkr * a(k)
+          a(j) = wkr * a(j) + wki * a(k)
+          a(k) = xr
+      end do
+      a(m) = c(0) * a(m)
+      end
+!
+      subroutine dstsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+      m = n / 2
+      ks = nc / n
+      kk = 0
+      do j = 1, m - 1
+          k = n - j
+          kk = kk + ks
+          wkr = c(kk) - c(nc - kk)
+          wki = c(kk) + c(nc - kk)
+          xr = wki * a(k) - wkr * a(j)
+          a(k) = wkr * a(k) + wki * a(j)
+          a(j) = xr
+      end do
+      a(m) = c(0) * a(m)
+      end
+!
diff --git a/fft4g_h.c b/fft4g_h.c
new file mode 100644
index 0000000..1064285
--- /dev/null
+++ b/fft4g_h.c
@@ -0,0 +1,1368 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :4, 2
+    data        :inplace
+    table       :not use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *);
+    void rdft(int, int, double *);
+    void ddct(int, int, double *);
+    void ddst(int, int, double *);
+    void dfct(int, double *);
+    void dfst(int, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            cdft(2*n, 1, a);
+        <case2>
+            cdft(2*n, -1, a);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]), 
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]), 
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+    [remark]
+        Inverse of 
+            cdft(2*n, -1, a);
+        is 
+            cdft(2*n, 1, a);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            rdft(n, 1, a);
+        <case2>
+            rdft(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+    [remark]
+        Inverse of 
+            rdft(n, 1, a);
+        is 
+            rdft(n, -1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ddct(n, 1, a);
+        <case2>
+            ddct(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+    [remark]
+        Inverse of 
+            ddct(n, -1, a);
+        is 
+            a[0] *= 0.5;
+            ddct(n, 1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ddst(n, 1, a);
+        <case2>
+            ddst(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+    [remark]
+        Inverse of 
+            ddst(n, -1, a);
+        is 
+            a[0] *= 0.5;
+            ddst(n, 1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        dfct(n, a);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+    [remark]
+        Inverse of 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a);
+        is 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        dfst(n, a);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+    [remark]
+        Inverse of 
+            dfst(n, a);
+        is 
+            dfst(n, a);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+*/
+
+
+void cdft(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void bitrv2conj(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    
+    if (n > 4) {
+        if (isgn >= 0) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+        } else {
+            bitrv2conj(n, a);
+            cftbsub(n, a);
+        }
+    } else if (n == 4) {
+        cftfsub(n, a);
+    }
+}
+
+
+void rdft(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    double xi;
+    
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a);
+            bitrv2(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+    }
+}
+
+
+void ddct(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    void dctsub(int n, double *a);
+    void dctsub4(int n, double *a);
+    int j;
+    double xr;
+    
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a);
+            bitrv2(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+    }
+    if (n > 4) {
+        dctsub(n, a);
+    } else {
+        dctsub4(n, a);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    void dstsub(int n, double *a);
+    void dstsub4(int n, double *a);
+    int j;
+    double xr;
+    
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a);
+            bitrv2(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+    }
+    if (n > 4) {
+        dstsub(n, a);
+    } else {
+        dstsub4(n, a);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a)
+{
+    void ddct(int n, int isgn, double *a);
+    void bitrv1(int n, double *a);
+    int j, k, m, mh;
+    double xr, xi, yr, yi, an;
+    
+    m = n >> 1;
+    for (j = 0; j < m; j++) {
+        k = n - j;
+        xr = a[j] + a[k];
+        a[j] -= a[k];
+        a[k] = xr;
+    }
+    an = a[n];
+    while (m >= 2) {
+        ddct(m, 1, a);
+        bitrv1(m, a);
+        mh = m >> 1;
+        xi = a[m];
+        a[m] = a[0];
+        a[0] = an - xi;
+        an += xi;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[m + k];
+            xi = a[m + j];
+            yr = a[j];
+            yi = a[k];
+            a[m + j] = yr;
+            a[m + k] = yi;
+            a[j] = xr - xi;
+            a[k] = xr + xi;
+        }
+        xr = a[mh];
+        a[mh] = a[m + mh];
+        a[m + mh] = xr;
+        m = mh;
+    }
+    xi = a[1];
+    a[1] = a[0];
+    a[0] = an + xi;
+    a[n] = an - xi;
+    bitrv1(n, a);
+}
+
+
+void dfst(int n, double *a)
+{
+    void ddst(int n, int isgn, double *a);
+    void bitrv1(int n, double *a);
+    int j, k, m, mh;
+    double xr, xi, yr, yi;
+    
+    m = n >> 1;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        xr = a[j] - a[k];
+        a[j] += a[k];
+        a[k] = xr;
+    }
+    a[0] = a[m];
+    while (m >= 2) {
+        ddst(m, 1, a);
+        bitrv1(m, a);
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[m + k];
+            xi = a[m + j];
+            yr = a[j];
+            yi = a[k];
+            a[m + j] = yr;
+            a[m + k] = yi;
+            a[j] = xr + xi;
+            a[k] = xr - xi;
+        }
+        a[m] = a[0];
+        a[0] = a[m + mh];
+        a[m + mh] = a[mh];
+        m = mh;
+    }
+    a[1] = a[0];
+    a[0] = 0;
+    bitrv1(n, a);
+}
+
+
+/* -------- child routines -------- */
+
+
+#include <math.h>
+#ifndef M_PI_2
+#define M_PI_2      1.570796326794896619231321691639751442098584699687
+#endif
+#ifndef WR5000  /* cos(M_PI_2*0.5000) */
+#define WR5000      0.707106781186547524400844362104849039284835937688
+#endif
+#ifndef WR2500  /* cos(M_PI_2*0.2500) */
+#define WR2500      0.923879532511286756128183189396788286822416625863
+#endif
+#ifndef WI2500  /* sin(M_PI_2*0.2500) */
+#define WI2500      0.382683432365089771728459984030398866761344562485
+#endif
+
+
+#ifndef RDFT_LOOP_DIV  /* control of the RDFT's speed & tolerance */
+#define RDFT_LOOP_DIV 64
+#endif
+
+#ifndef DCST_LOOP_DIV  /* control of the DCT,DST's speed & tolerance */
+#define DCST_LOOP_DIV 64
+#endif
+
+
+void bitrv2(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k;
+    double xr, xi, yr, yi;
+    
+    l = n >> 2;
+    m = 2;
+    while (m < l) {
+        l >>= 1;
+        m <<= 1;
+    }
+    if (m == l) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 2) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = a[j + 1];
+                yr = a[k];
+                yi = a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            j1 = j0 + k0 + m;
+            k1 = j1 + m;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        j0 = 0;
+        for (k0 = 2; k0 < m; k0 += 2) {
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = a[j + 1];
+                yr = a[k];
+                yi = a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+        }
+    }
+}
+
+
+void bitrv2conj(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k;
+    double xr, xi, yr, yi;
+    
+    l = n >> 2;
+    m = 2;
+    while (m < l) {
+        l >>= 1;
+        m <<= 1;
+    }
+    if (m == l) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 2) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = -a[j + 1];
+                yr = a[k];
+                yi = -a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            a[k1 + 1] = -a[k1 + 1];
+            j1 = k1 + m;
+            k1 = j1 + m;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            k1 += m;
+            a[k1 + 1] = -a[k1 + 1];
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        a[1] = -a[1];
+        a[m + 1] = -a[m + 1];
+        j0 = 0;
+        for (k0 = 2; k0 < m; k0 += 2) {
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = -a[j + 1];
+                yr = a[k];
+                yi = -a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            a[k1 + 1] = -a[k1 + 1];
+            a[k1 + m + 1] = -a[k1 + m + 1];
+        }
+    }
+}
+
+
+void bitrv1(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k;
+    double x;
+    
+    l = n >> 2;
+    m = 1;
+    while (m < l) {
+        l >>= 1;
+        m <<= 1;
+    }
+    if (m == l) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0++) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j++) {
+                x = a[j];
+                a[j] = a[k];
+                a[k] = x;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 += m;
+                k1 -= m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 += m;
+                k1 += 2 * m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            j1 = j0 + k0 + m;
+            k1 = j1 + m;
+            x = a[j1];
+            a[j1] = a[k1];
+            a[k1] = x;
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        j0 = 0;
+        for (k0 = 1; k0 < m; k0++) {
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j++) {
+                x = a[j];
+                a[j] = a[k];
+                a[k] = x;
+                j1 = j + m;
+                k1 = k + m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+        }
+    }
+}
+
+
+void cftfsub(int n, double *a)
+{
+    void cft1st(int n, double *a);
+    void cftmdl(int n, int l, double *a);
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 2;
+    if (n > 8) {
+        cft1st(n, a);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i - x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i + x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i - x3r;
+        }
+    } else {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = a[j + 1] - a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] += a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cftbsub(int n, double *a)
+{
+    void cft1st(int n, double *a);
+    void cftmdl(int n, int l, double *a);
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 2;
+    if (n > 8) {
+        cft1st(n, a);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i - x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i + x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i - x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i + x3r;
+        }
+    } else {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = -a[j + 1] + a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] = -a[j + 1] - a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cft1st(int n, double *a)
+{
+    int j, kj, kr;
+    double ew, wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[2];
+    x0i = a[1] + a[3];
+    x1r = a[0] - a[2];
+    x1i = a[1] - a[3];
+    x2r = a[4] + a[6];
+    x2i = a[5] + a[7];
+    x3r = a[4] - a[6];
+    x3i = a[5] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[2] = x1r - x3i;
+    a[3] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+    wn4r = WR5000;
+    x0r = a[8] + a[10];
+    x0i = a[9] + a[11];
+    x1r = a[8] - a[10];
+    x1i = a[9] - a[11];
+    x2r = a[12] + a[14];
+    x2i = a[13] + a[15];
+    x3r = a[12] - a[14];
+    x3i = a[13] - a[15];
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[12] = x2i - x0i;
+    a[13] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[10] = wn4r * (x0r - x0i);
+    a[11] = wn4r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[14] = wn4r * (x0i - x0r);
+    a[15] = wn4r * (x0i + x0r);
+    ew = M_PI_2 / n;
+    kr = 0;
+    for (j = 16; j < n; j += 16) {
+        for (kj = n >> 2; kj > (kr ^= kj); kj >>= 1);
+        wk1r = cos(ew * kr);
+        wk1i = sin(ew * kr);
+        wk2r = 1 - 2 * wk1i * wk1i;
+        wk2i = 2 * wk1i * wk1r;
+        wk3r = wk1r - 2 * wk2i * wk1i;
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        x0r = a[j] + a[j + 2];
+        x0i = a[j + 1] + a[j + 3];
+        x1r = a[j] - a[j + 2];
+        x1i = a[j + 1] - a[j + 3];
+        x2r = a[j + 4] + a[j + 6];
+        x2i = a[j + 5] + a[j + 7];
+        x3r = a[j + 4] - a[j + 6];
+        x3i = a[j + 5] - a[j + 7];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 4] = wk2r * x0r - wk2i * x0i;
+        a[j + 5] = wk2r * x0i + wk2i * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 2] = wk1r * x0r - wk1i * x0i;
+        a[j + 3] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 6] = wk3r * x0r - wk3i * x0i;
+        a[j + 7] = wk3r * x0i + wk3i * x0r;
+        x0r = wn4r * (wk1r - wk1i);
+        wk1i = wn4r * (wk1r + wk1i);
+        wk1r = x0r;
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        x0r = a[j + 8] + a[j + 10];
+        x0i = a[j + 9] + a[j + 11];
+        x1r = a[j + 8] - a[j + 10];
+        x1i = a[j + 9] - a[j + 11];
+        x2r = a[j + 12] + a[j + 14];
+        x2i = a[j + 13] + a[j + 15];
+        x3r = a[j + 12] - a[j + 14];
+        x3i = a[j + 13] - a[j + 15];
+        a[j + 8] = x0r + x2r;
+        a[j + 9] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 12] = -wk2i * x0r - wk2r * x0i;
+        a[j + 13] = -wk2i * x0i + wk2r * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 10] = wk1r * x0r - wk1i * x0i;
+        a[j + 11] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 14] = wk3r * x0r - wk3i * x0i;
+        a[j + 15] = wk3r * x0i + wk3i * x0r;
+    }
+}
+
+
+void cftmdl(int n, int l, double *a)
+{
+    int j, j1, j2, j3, k, kj, kr, m, m2;
+    double ew, wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    m = l << 2;
+    for (j = 0; j < l; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x0r - x2r;
+        a[j2 + 1] = x0i - x2i;
+        a[j1] = x1r - x3i;
+        a[j1 + 1] = x1i + x3r;
+        a[j3] = x1r + x3i;
+        a[j3 + 1] = x1i - x3r;
+    }
+    wn4r = WR5000;
+    for (j = m; j < l + m; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x2i - x0i;
+        a[j2 + 1] = x0r - x2r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j1] = wn4r * (x0r - x0i);
+        a[j1 + 1] = wn4r * (x0r + x0i);
+        x0r = x3i + x1r;
+        x0i = x3r - x1i;
+        a[j3] = wn4r * (x0i - x0r);
+        a[j3 + 1] = wn4r * (x0i + x0r);
+    }
+    ew = M_PI_2 / n;
+    kr = 0;
+    m2 = 2 * m;
+    for (k = m2; k < n; k += m2) {
+        for (kj = n >> 2; kj > (kr ^= kj); kj >>= 1);
+        wk1r = cos(ew * kr);
+        wk1i = sin(ew * kr);
+        wk2r = 1 - 2 * wk1i * wk1i;
+        wk2i = 2 * wk1i * wk1r;
+        wk3r = wk1r - 2 * wk2i * wk1i;
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        for (j = k; j < l + k; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = wk2r * x0r - wk2i * x0i;
+            a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+        x0r = wn4r * (wk1r - wk1i);
+        wk1i = wn4r * (wk1r + wk1i);
+        wk1r = x0r;
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        for (j = k + m; j < l + (k + m); j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = -wk2i * x0r - wk2r * x0i;
+            a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+    }
+}
+
+
+void rftfsub(int n, double *a)
+{
+    int i, i0, j, k;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = 2 * M_PI_2 / n;
+    wkr = 0;
+    wki = 0;
+    wdi = cos(ec);
+    wdr = sin(ec);
+    wdi *= wdr;
+    wdr *= wdr;
+    w1r = 1 - 2 * wdr;
+    w1i = 2 * wdi;
+    ss = 2 * w1i;
+    i = n >> 1;
+    for (;;) {
+        i0 = i - 4 * RDFT_LOOP_DIV;
+        if (i0 < 4) {
+            i0 = 4;
+        }
+        for (j = i - 4; j >= i0; j -= 4) {
+            k = n - j;
+            xr = a[j + 2] - a[k - 2];
+            xi = a[j + 3] + a[k - 1];
+            yr = wdr * xr - wdi * xi;
+            yi = wdr * xi + wdi * xr;
+            a[j + 2] -= yr;
+            a[j + 3] -= yi;
+            a[k - 2] += yr;
+            a[k - 1] -= yi;
+            wkr += ss * wdi;
+            wki += ss * (0.5 - wdr);
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr - wki * xi;
+            yi = wkr * xi + wki * xr;
+            a[j] -= yr;
+            a[j + 1] -= yi;
+            a[k] += yr;
+            a[k + 1] -= yi;
+            wdr += ss * wki;
+            wdi += ss * (0.5 - wkr);
+        }
+        if (i0 == 4) {
+            break;
+        }
+        wkr = 0.5 * sin(ec * i0);
+        wki = 0.5 * cos(ec * i0);
+        wdr = 0.5 - (wkr * w1r - wki * w1i);
+        wdi = wkr * w1i + wki * w1r;
+        wkr = 0.5 - wkr;
+        i = i0;
+    }
+    xr = a[2] - a[n - 2];
+    xi = a[3] + a[n - 1];
+    yr = wdr * xr - wdi * xi;
+    yi = wdr * xi + wdi * xr;
+    a[2] -= yr;
+    a[3] -= yi;
+    a[n - 2] += yr;
+    a[n - 1] -= yi;
+}
+
+
+void rftbsub(int n, double *a)
+{
+    int i, i0, j, k;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = 2 * M_PI_2 / n;
+    wkr = 0;
+    wki = 0;
+    wdi = cos(ec);
+    wdr = sin(ec);
+    wdi *= wdr;
+    wdr *= wdr;
+    w1r = 1 - 2 * wdr;
+    w1i = 2 * wdi;
+    ss = 2 * w1i;
+    i = n >> 1;
+    a[i + 1] = -a[i + 1];
+    for (;;) {
+        i0 = i - 4 * RDFT_LOOP_DIV;
+        if (i0 < 4) {
+            i0 = 4;
+        }
+        for (j = i - 4; j >= i0; j -= 4) {
+            k = n - j;
+            xr = a[j + 2] - a[k - 2];
+            xi = a[j + 3] + a[k - 1];
+            yr = wdr * xr + wdi * xi;
+            yi = wdr * xi - wdi * xr;
+            a[j + 2] -= yr;
+            a[j + 3] = yi - a[j + 3];
+            a[k - 2] += yr;
+            a[k - 1] = yi - a[k - 1];
+            wkr += ss * wdi;
+            wki += ss * (0.5 - wdr);
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr + wki * xi;
+            yi = wkr * xi - wki * xr;
+            a[j] -= yr;
+            a[j + 1] = yi - a[j + 1];
+            a[k] += yr;
+            a[k + 1] = yi - a[k + 1];
+            wdr += ss * wki;
+            wdi += ss * (0.5 - wkr);
+        }
+        if (i0 == 4) {
+            break;
+        }
+        wkr = 0.5 * sin(ec * i0);
+        wki = 0.5 * cos(ec * i0);
+        wdr = 0.5 - (wkr * w1r - wki * w1i);
+        wdi = wkr * w1i + wki * w1r;
+        wkr = 0.5 - wkr;
+        i = i0;
+    }
+    xr = a[2] - a[n - 2];
+    xi = a[3] + a[n - 1];
+    yr = wdr * xr + wdi * xi;
+    yi = wdr * xi - wdi * xr;
+    a[2] -= yr;
+    a[3] = yi - a[3];
+    a[n - 2] += yr;
+    a[n - 1] = yi - a[n - 1];
+    a[1] = -a[1];
+}
+
+
+void dctsub(int n, double *a)
+{
+    int i, i0, j, k, m;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = M_PI_2 / n;
+    wkr = 0.5;
+    wki = 0.5;
+    w1r = cos(ec);
+    w1i = sin(ec);
+    wdr = 0.5 * (w1r - w1i);
+    wdi = 0.5 * (w1r + w1i);
+    ss = 2 * w1i;
+    m = n >> 1;
+    i = 0;
+    for (;;) {
+        i0 = i + 2 * DCST_LOOP_DIV;
+        if (i0 > m - 2) {
+            i0 = m - 2;
+        }
+        for (j = i + 2; j <= i0; j += 2) {
+            k = n - j;
+            xr = wdi * a[j - 1] - wdr * a[k + 1];
+            xi = wdr * a[j - 1] + wdi * a[k + 1];
+            wkr -= ss * wdi;
+            wki += ss * wdr;
+            yr = wki * a[j] - wkr * a[k];
+            yi = wkr * a[j] + wki * a[k];
+            wdr -= ss * wki;
+            wdi += ss * wkr;
+            a[k + 1] = xr;
+            a[k] = yr;
+            a[j - 1] = xi;
+            a[j] = yi;
+        }
+        if (i0 == m - 2) {
+            break;
+        }
+        wdr = cos(ec * i0);
+        wdi = sin(ec * i0);
+        wkr = 0.5 * (wdr - wdi);
+        wki = 0.5 * (wdr + wdi);
+        wdr = wkr * w1r - wki * w1i;
+        wdi = wkr * w1i + wki * w1r;
+        i = i0;
+    }
+    xr = wdi * a[m - 1] - wdr * a[m + 1];
+    a[m - 1] = wdr * a[m - 1] + wdi * a[m + 1];
+    a[m + 1] = xr;
+    a[m] *= wki + ss * wdr;
+}
+
+
+void dstsub(int n, double *a)
+{
+    int i, i0, j, k, m;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = M_PI_2 / n;
+    wkr = 0.5;
+    wki = 0.5;
+    w1r = cos(ec);
+    w1i = sin(ec);
+    wdr = 0.5 * (w1r - w1i);
+    wdi = 0.5 * (w1r + w1i);
+    ss = 2 * w1i;
+    m = n >> 1;
+    i = 0;
+    for (;;) {
+        i0 = i + 2 * DCST_LOOP_DIV;
+        if (i0 > m - 2) {
+            i0 = m - 2;
+        }
+        for (j = i + 2; j <= i0; j += 2) {
+            k = n - j;
+            xr = wdi * a[k + 1] - wdr * a[j - 1];
+            xi = wdr * a[k + 1] + wdi * a[j - 1];
+            wkr -= ss * wdi;
+            wki += ss * wdr;
+            yr = wki * a[k] - wkr * a[j];
+            yi = wkr * a[k] + wki * a[j];
+            wdr -= ss * wki;
+            wdi += ss * wkr;
+            a[j - 1] = xr;
+            a[j] = yr;
+            a[k + 1] = xi;
+            a[k] = yi;
+        }
+        if (i0 == m - 2) {
+            break;
+        }
+        wdr = cos(ec * i0);
+        wdi = sin(ec * i0);
+        wkr = 0.5 * (wdr - wdi);
+        wki = 0.5 * (wdr + wdi);
+        wdr = wkr * w1r - wki * w1i;
+        wdi = wkr * w1i + wki * w1r;
+        i = i0;
+    }
+    xr = wdi * a[m + 1] - wdr * a[m - 1];
+    a[m + 1] = wdr * a[m + 1] + wdi * a[m - 1];
+    a[m - 1] = xr;
+    a[m] *= wki + ss * wdr;
+}
+
+
+void dctsub4(int n, double *a)
+{
+    int m;
+    double wki, wdr, wdi, xr;
+    
+    wki = WR5000;
+    m = n >> 1;
+    if (m == 2) {
+        wdr = wki * WI2500;
+        wdi = wki * WR2500;
+        xr = wdi * a[1] - wdr * a[3];
+        a[1] = wdr * a[1] + wdi * a[3];
+        a[3] = xr;
+    }
+    a[m] *= wki;
+}
+
+
+void dstsub4(int n, double *a)
+{
+    int m;
+    double wki, wdr, wdi, xr;
+    
+    wki = WR5000;
+    m = n >> 1;
+    if (m == 2) {
+        wdr = wki * WI2500;
+        wdi = wki * WR2500;
+        xr = wdi * a[3] - wdr * a[1];
+        a[3] = wdr * a[3] + wdi * a[1];
+        a[1] = xr;
+    }
+    a[m] *= wki;
+}
+
diff --git a/fft8g.c b/fft8g.c
new file mode 100644
index 0000000..99fcad5
--- /dev/null
+++ b/fft8g.c
@@ -0,0 +1,1642 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :8, 4, 2
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]), 
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]), 
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            cdft(2*n, -1, a, ip, w);
+        is 
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            rdft(n, 1, a, ip, w);
+        is 
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddct(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddst(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            dfst(n, a, t, ip, w);
+        is 
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when the larger table required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+void cdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void bitrv2(int n, int *ip, double *a);
+    void bitrv2conj(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    
+    if (n > (ip[0] << 2)) {
+        makewt(n >> 2, ip, w);
+    }
+    if (n > 4) {
+        if (isgn >= 0) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+        } else {
+            bitrv2conj(n, ip + 2, a);
+            cftbsub(n, a, w);
+        }
+    } else if (n == 4) {
+        cftfsub(n, a, w);
+    }
+}
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    int nw, nc;
+    double xi;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 2)) {
+        nc = n >> 2;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+}
+
+
+void ddct(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dctsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void cftbsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dstsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a, double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    m = n >> 1;
+    yi = a[m];
+    xi = a[0] + a[n];
+    a[0] -= a[n];
+    t[0] = xi - yi;
+    t[m] = xi + yi;
+    if (n > 2) {
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[j] - a[n - j];
+            xi = a[j] + a[n - j];
+            yr = a[k] - a[n - k];
+            yi = a[k] + a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi - yi;
+            t[k] = xi + yi;
+        }
+        t[mh] = a[mh] + a[n - mh];
+        a[mh] -= a[n - mh];
+        dctsub(m, a, nc, w + nw);
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[0] - a[1];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] + a[j + 1];
+            a[2 * j - 1] = a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {
+            dctsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[0] - t[1];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = t[j] - t[j + 1];
+                a[k + l] = t[j] + t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 0; j < mh; j++) {
+                k = m - j;
+                t[j] = t[m + k] - t[m + j];
+                t[k] = t[m + k] + t[m + j];
+            }
+            t[mh] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+        a[n] = t[2] - t[1];
+        a[0] = t[2] + t[1];
+    } else {
+        a[1] = a[0];
+        a[2] = t[0];
+        a[0] = t[1];
+    }
+}
+
+
+void dfst(int n, double *a, double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void bitrv2(int n, int *ip, double *a);
+    void cftfsub(int n, double *a, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    if (n > 2) {
+        m = n >> 1;
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[j] + a[n - j];
+            xi = a[j] - a[n - j];
+            yr = a[k] + a[n - k];
+            yi = a[k] - a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi + yi;
+            t[k] = xi - yi;
+        }
+        t[0] = a[mh] - a[n - mh];
+        a[mh] += a[n - mh];
+        a[0] = a[m];
+        dstsub(m, a, nc, w + nw);
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[1] - a[0];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] - a[j + 1];
+            a[2 * j - 1] = -a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {
+            dstsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[1] - t[0];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = -t[j] - t[j + 1];
+                a[k + l] = t[j] - t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 1; j < mh; j++) {
+                k = m - j;
+                t[j] = t[m + k] + t[m + j];
+                t[k] = t[m + k] - t[m + j];
+            }
+            t[0] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+    }
+    a[0] = 0;
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w)
+{
+    void bitrv2(int n, int *ip, double *a);
+    int j, nwh;
+    double delta, x, y;
+    
+    ip[0] = nw;
+    ip[1] = 1;
+    if (nw > 2) {
+        nwh = nw >> 1;
+        delta = atan(1.0) / nwh;
+        w[0] = 1;
+        w[1] = 0;
+        w[nwh] = cos(delta * nwh);
+        w[nwh + 1] = w[nwh];
+        if (nwh > 2) {
+            for (j = 2; j < nwh; j += 2) {
+                x = cos(delta * j);
+                y = sin(delta * j);
+                w[j] = x;
+                w[j + 1] = y;
+                w[nw - j] = y;
+                w[nw - j + 1] = x;
+            }
+            for (j = nwh - 2; j >= 2; j -= 2) {
+                x = w[2 * j];
+                y = w[2 * j + 1];
+                w[nwh + j] = x;
+                w[nwh + j + 1] = y;
+            }
+            bitrv2(nw, ip + 2, w);
+        }
+    }
+}
+
+
+void makect(int nc, int *ip, double *c)
+{
+    int j, nch;
+    double delta;
+    
+    ip[1] = nc;
+    if (nc > 1) {
+        nch = nc >> 1;
+        delta = atan(1.0) / nch;
+        c[0] = cos(delta * nch);
+        c[nch] = 0.5 * c[0];
+        for (j = 1; j < nch; j++) {
+            c[j] = 0.5 * cos(delta * j);
+            c[nc - j] = 0.5 * sin(delta * j);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+void bitrv2(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, m2;
+    double xr, xi, yr, yi;
+    
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            j1 = 2 * k + m2 + ip[k];
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    } else {
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+        }
+    }
+}
+
+
+void bitrv2conj(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, m2;
+    double xr, xi, yr, yi;
+    
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];
+            a[k1 + 1] = -a[k1 + 1];
+            j1 = k1 + m2;
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            k1 += m2;
+            a[k1 + 1] = -a[k1 + 1];
+        }
+    } else {
+        a[1] = -a[1];
+        a[m2 + 1] = -a[m2 + 1];
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];
+            a[k1 + 1] = -a[k1 + 1];
+            a[k1 + m2 + 1] = -a[k1 + m2 + 1];
+        }
+    }
+}
+
+
+void cftfsub(int n, double *a, double *w)
+{
+    void cft1st(int n, double *a, double *w);
+    void cftmdl(int n, int l, double *a, double *w);
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 2;
+    if (n >= 16) {
+        cft1st(n, a, w);
+        l = 16;
+        while ((l << 3) <= n) {
+            cftmdl(n, l, a, w);
+            l <<= 3;
+        }
+    }
+    if ((l << 1) < n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i - x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i + x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i - x3r;
+        }
+    } else if ((l << 1) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = a[j + 1] - a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] += a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cftbsub(int n, double *a, double *w)
+{
+    void cft1st(int n, double *a, double *w);
+    void cftmdl(int n, int l, double *a, double *w);
+    int j, j1, j2, j3, j4, j5, j6, j7, l;
+    double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    l = 2;
+    if (n > 16) {
+        cft1st(n, a, w);
+        l = 16;
+        while ((l << 3) < n) {
+            cftmdl(n, l, a, w);
+            l <<= 3;
+        }
+    }
+    if ((l << 2) < n) {
+        wn4r = w[2];
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            j4 = j3 + l;
+            j5 = j4 + l;
+            j6 = j5 + l;
+            j7 = j6 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            y0r = x0r + x2r;
+            y0i = x0i - x2i;
+            y2r = x0r - x2r;
+            y2i = x0i + x2i;
+            y1r = x1r - x3i;
+            y1i = x1i - x3r;
+            y3r = x1r + x3i;
+            y3i = x1i + x3r;
+            x0r = a[j4] + a[j5];
+            x0i = a[j4 + 1] + a[j5 + 1];
+            x1r = a[j4] - a[j5];
+            x1i = a[j4 + 1] - a[j5 + 1];
+            x2r = a[j6] + a[j7];
+            x2i = a[j6 + 1] + a[j7 + 1];
+            x3r = a[j6] - a[j7];
+            x3i = a[j6 + 1] - a[j7 + 1];
+            y4r = x0r + x2r;
+            y4i = x0i + x2i;
+            y6r = x0r - x2r;
+            y6i = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            x2r = x1r + x3i;
+            x2i = x1i - x3r;
+            y5r = wn4r * (x0r - x0i);
+            y5i = wn4r * (x0r + x0i);
+            y7r = wn4r * (x2r - x2i);
+            y7i = wn4r * (x2r + x2i);
+            a[j1] = y1r + y5r;
+            a[j1 + 1] = y1i - y5i;
+            a[j5] = y1r - y5r;
+            a[j5 + 1] = y1i + y5i;
+            a[j3] = y3r - y7i;
+            a[j3 + 1] = y3i - y7r;
+            a[j7] = y3r + y7i;
+            a[j7 + 1] = y3i + y7r;
+            a[j] = y0r + y4r;
+            a[j + 1] = y0i - y4i;
+            a[j4] = y0r - y4r;
+            a[j4 + 1] = y0i + y4i;
+            a[j2] = y2r - y6i;
+            a[j2 + 1] = y2i - y6r;
+            a[j6] = y2r + y6i;
+            a[j6 + 1] = y2i + y6r;
+        }
+    } else if ((l << 2) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i - x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i + x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i - x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i + x3r;
+        }
+    } else {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = -a[j + 1] + a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] = -a[j + 1] - a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cft1st(int n, double *a, double *w)
+{
+    int j, k1;
+    double wn4r, wtmp, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
+        wk4r, wk4i, wk5r, wk5i, wk6r, wk6i, wk7r, wk7i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = w[2];
+    x0r = a[0] + a[2];
+    x0i = a[1] + a[3];
+    x1r = a[0] - a[2];
+    x1i = a[1] - a[3];
+    x2r = a[4] + a[6];
+    x2i = a[5] + a[7];
+    x3r = a[4] - a[6];
+    x3i = a[5] - a[7];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y2r = x0r - x2r;
+    y2i = x0i - x2i;
+    y1r = x1r - x3i;
+    y1i = x1i + x3r;
+    y3r = x1r + x3i;
+    y3i = x1i - x3r;
+    x0r = a[8] + a[10];
+    x0i = a[9] + a[11];
+    x1r = a[8] - a[10];
+    x1i = a[9] - a[11];
+    x2r = a[12] + a[14];
+    x2i = a[13] + a[15];
+    x3r = a[12] - a[14];
+    x3i = a[13] - a[15];
+    y4r = x0r + x2r;
+    y4i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    x2r = x1r + x3i;
+    x2i = x1i - x3r;
+    y5r = wn4r * (x0r - x0i);
+    y5i = wn4r * (x0r + x0i);
+    y7r = wn4r * (x2r - x2i);
+    y7i = wn4r * (x2r + x2i);
+    a[2] = y1r + y5r;
+    a[3] = y1i + y5i;
+    a[10] = y1r - y5r;
+    a[11] = y1i - y5i;
+    a[6] = y3r - y7i;
+    a[7] = y3i + y7r;
+    a[14] = y3r + y7i;
+    a[15] = y3i - y7r;
+    a[0] = y0r + y4r;
+    a[1] = y0i + y4i;
+    a[8] = y0r - y4r;
+    a[9] = y0i - y4i;
+    a[4] = y2r - y6i;
+    a[5] = y2i + y6r;
+    a[12] = y2r + y6i;
+    a[13] = y2i - y6r;
+    if (n > 16) {
+        wk1r = w[4];
+        wk1i = w[5];
+        x0r = a[16] + a[18];
+        x0i = a[17] + a[19];
+        x1r = a[16] - a[18];
+        x1i = a[17] - a[19];
+        x2r = a[20] + a[22];
+        x2i = a[21] + a[23];
+        x3r = a[20] - a[22];
+        x3i = a[21] - a[23];
+        y0r = x0r + x2r;
+        y0i = x0i + x2i;
+        y2r = x0r - x2r;
+        y2i = x0i - x2i;
+        y1r = x1r - x3i;
+        y1i = x1i + x3r;
+        y3r = x1r + x3i;
+        y3i = x1i - x3r;
+        x0r = a[24] + a[26];
+        x0i = a[25] + a[27];
+        x1r = a[24] - a[26];
+        x1i = a[25] - a[27];
+        x2r = a[28] + a[30];
+        x2i = a[29] + a[31];
+        x3r = a[28] - a[30];
+        x3i = a[29] - a[31];
+        y4r = x0r + x2r;
+        y4i = x0i + x2i;
+        y6r = x0r - x2r;
+        y6i = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        x2r = x1r + x3i;
+        x2i = x3r - x1i;
+        y5r = wk1i * x0r - wk1r * x0i;
+        y5i = wk1i * x0i + wk1r * x0r;
+        y7r = wk1r * x2r + wk1i * x2i;
+        y7i = wk1r * x2i - wk1i * x2r;
+        x0r = wk1r * y1r - wk1i * y1i;
+        x0i = wk1r * y1i + wk1i * y1r;
+        a[18] = x0r + y5r;
+        a[19] = x0i + y5i;
+        a[26] = y5i - x0i;
+        a[27] = x0r - y5r;
+        x0r = wk1i * y3r - wk1r * y3i;
+        x0i = wk1i * y3i + wk1r * y3r;
+        a[22] = x0r - y7r;
+        a[23] = x0i + y7i;
+        a[30] = y7i - x0i;
+        a[31] = x0r + y7r;
+        a[16] = y0r + y4r;
+        a[17] = y0i + y4i;
+        a[24] = y4i - y0i;
+        a[25] = y0r - y4r;
+        x0r = y2r - y6i;
+        x0i = y2i + y6r;
+        a[20] = wn4r * (x0r - x0i);
+        a[21] = wn4r * (x0i + x0r);
+        x0r = y6r - y2i;
+        x0i = y2r + y6i;
+        a[28] = wn4r * (x0r - x0i);
+        a[29] = wn4r * (x0i + x0r);
+        k1 = 4;
+        for (j = 32; j < n; j += 16) {
+            k1 += 4;
+            wk1r = w[k1];
+            wk1i = w[k1 + 1];
+            wk2r = w[k1 + 2];
+            wk2i = w[k1 + 3];
+            wtmp = 2 * wk2i;
+            wk3r = wk1r - wtmp * wk1i;
+            wk3i = wtmp * wk1r - wk1i;
+            wk4r = 1 - wtmp * wk2i;
+            wk4i = wtmp * wk2r;
+            wtmp = 2 * wk4i;
+            wk5r = wk3r - wtmp * wk1i;
+            wk5i = wtmp * wk1r - wk3i;
+            wk6r = wk2r - wtmp * wk2i;
+            wk6i = wtmp * wk2r - wk2i;
+            wk7r = wk1r - wtmp * wk3i;
+            wk7i = wtmp * wk3r - wk1i;
+            x0r = a[j] + a[j + 2];
+            x0i = a[j + 1] + a[j + 3];
+            x1r = a[j] - a[j + 2];
+            x1i = a[j + 1] - a[j + 3];
+            x2r = a[j + 4] + a[j + 6];
+            x2i = a[j + 5] + a[j + 7];
+            x3r = a[j + 4] - a[j + 6];
+            x3i = a[j + 5] - a[j + 7];
+            y0r = x0r + x2r;
+            y0i = x0i + x2i;
+            y2r = x0r - x2r;
+            y2i = x0i - x2i;
+            y1r = x1r - x3i;
+            y1i = x1i + x3r;
+            y3r = x1r + x3i;
+            y3i = x1i - x3r;
+            x0r = a[j + 8] + a[j + 10];
+            x0i = a[j + 9] + a[j + 11];
+            x1r = a[j + 8] - a[j + 10];
+            x1i = a[j + 9] - a[j + 11];
+            x2r = a[j + 12] + a[j + 14];
+            x2i = a[j + 13] + a[j + 15];
+            x3r = a[j + 12] - a[j + 14];
+            x3i = a[j + 13] - a[j + 15];
+            y4r = x0r + x2r;
+            y4i = x0i + x2i;
+            y6r = x0r - x2r;
+            y6i = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            x2r = x1r + x3i;
+            x2i = x1i - x3r;
+            y5r = wn4r * (x0r - x0i);
+            y5i = wn4r * (x0r + x0i);
+            y7r = wn4r * (x2r - x2i);
+            y7i = wn4r * (x2r + x2i);
+            x0r = y1r + y5r;
+            x0i = y1i + y5i;
+            a[j + 2] = wk1r * x0r - wk1i * x0i;
+            a[j + 3] = wk1r * x0i + wk1i * x0r;
+            x0r = y1r - y5r;
+            x0i = y1i - y5i;
+            a[j + 10] = wk5r * x0r - wk5i * x0i;
+            a[j + 11] = wk5r * x0i + wk5i * x0r;
+            x0r = y3r - y7i;
+            x0i = y3i + y7r;
+            a[j + 6] = wk3r * x0r - wk3i * x0i;
+            a[j + 7] = wk3r * x0i + wk3i * x0r;
+            x0r = y3r + y7i;
+            x0i = y3i - y7r;
+            a[j + 14] = wk7r * x0r - wk7i * x0i;
+            a[j + 15] = wk7r * x0i + wk7i * x0r;
+            a[j] = y0r + y4r;
+            a[j + 1] = y0i + y4i;
+            x0r = y0r - y4r;
+            x0i = y0i - y4i;
+            a[j + 8] = wk4r * x0r - wk4i * x0i;
+            a[j + 9] = wk4r * x0i + wk4i * x0r;
+            x0r = y2r - y6i;
+            x0i = y2i + y6r;
+            a[j + 4] = wk2r * x0r - wk2i * x0i;
+            a[j + 5] = wk2r * x0i + wk2i * x0r;
+            x0r = y2r + y6i;
+            x0i = y2i - y6r;
+            a[j + 12] = wk6r * x0r - wk6i * x0i;
+            a[j + 13] = wk6r * x0i + wk6i * x0r;
+        }
+    }
+}
+
+
+void cftmdl(int n, int l, double *a, double *w)
+{
+    int j, j1, j2, j3, j4, j5, j6, j7, k, k1, m;
+    double wn4r, wtmp, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
+        wk4r, wk4i, wk5r, wk5i, wk6r, wk6i, wk7r, wk7i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    m = l << 3;
+    wn4r = w[2];
+    for (j = 0; j < l; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        j4 = j3 + l;
+        j5 = j4 + l;
+        j6 = j5 + l;
+        j7 = j6 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        y0r = x0r + x2r;
+        y0i = x0i + x2i;
+        y2r = x0r - x2r;
+        y2i = x0i - x2i;
+        y1r = x1r - x3i;
+        y1i = x1i + x3r;
+        y3r = x1r + x3i;
+        y3i = x1i - x3r;
+        x0r = a[j4] + a[j5];
+        x0i = a[j4 + 1] + a[j5 + 1];
+        x1r = a[j4] - a[j5];
+        x1i = a[j4 + 1] - a[j5 + 1];
+        x2r = a[j6] + a[j7];
+        x2i = a[j6 + 1] + a[j7 + 1];
+        x3r = a[j6] - a[j7];
+        x3i = a[j6 + 1] - a[j7 + 1];
+        y4r = x0r + x2r;
+        y4i = x0i + x2i;
+        y6r = x0r - x2r;
+        y6i = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        x2r = x1r + x3i;
+        x2i = x1i - x3r;
+        y5r = wn4r * (x0r - x0i);
+        y5i = wn4r * (x0r + x0i);
+        y7r = wn4r * (x2r - x2i);
+        y7i = wn4r * (x2r + x2i);
+        a[j1] = y1r + y5r;
+        a[j1 + 1] = y1i + y5i;
+        a[j5] = y1r - y5r;
+        a[j5 + 1] = y1i - y5i;
+        a[j3] = y3r - y7i;
+        a[j3 + 1] = y3i + y7r;
+        a[j7] = y3r + y7i;
+        a[j7 + 1] = y3i - y7r;
+        a[j] = y0r + y4r;
+        a[j + 1] = y0i + y4i;
+        a[j4] = y0r - y4r;
+        a[j4 + 1] = y0i - y4i;
+        a[j2] = y2r - y6i;
+        a[j2 + 1] = y2i + y6r;
+        a[j6] = y2r + y6i;
+        a[j6 + 1] = y2i - y6r;
+    }
+    if (m < n) {
+        wk1r = w[4];
+        wk1i = w[5];
+        for (j = m; j < l + m; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            j4 = j3 + l;
+            j5 = j4 + l;
+            j6 = j5 + l;
+            j7 = j6 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            y0r = x0r + x2r;
+            y0i = x0i + x2i;
+            y2r = x0r - x2r;
+            y2i = x0i - x2i;
+            y1r = x1r - x3i;
+            y1i = x1i + x3r;
+            y3r = x1r + x3i;
+            y3i = x1i - x3r;
+            x0r = a[j4] + a[j5];
+            x0i = a[j4 + 1] + a[j5 + 1];
+            x1r = a[j4] - a[j5];
+            x1i = a[j4 + 1] - a[j5 + 1];
+            x2r = a[j6] + a[j7];
+            x2i = a[j6 + 1] + a[j7 + 1];
+            x3r = a[j6] - a[j7];
+            x3i = a[j6 + 1] - a[j7 + 1];
+            y4r = x0r + x2r;
+            y4i = x0i + x2i;
+            y6r = x0r - x2r;
+            y6i = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            x2r = x1r + x3i;
+            x2i = x3r - x1i;
+            y5r = wk1i * x0r - wk1r * x0i;
+            y5i = wk1i * x0i + wk1r * x0r;
+            y7r = wk1r * x2r + wk1i * x2i;
+            y7i = wk1r * x2i - wk1i * x2r;
+            x0r = wk1r * y1r - wk1i * y1i;
+            x0i = wk1r * y1i + wk1i * y1r;
+            a[j1] = x0r + y5r;
+            a[j1 + 1] = x0i + y5i;
+            a[j5] = y5i - x0i;
+            a[j5 + 1] = x0r - y5r;
+            x0r = wk1i * y3r - wk1r * y3i;
+            x0i = wk1i * y3i + wk1r * y3r;
+            a[j3] = x0r - y7r;
+            a[j3 + 1] = x0i + y7i;
+            a[j7] = y7i - x0i;
+            a[j7 + 1] = x0r + y7r;
+            a[j] = y0r + y4r;
+            a[j + 1] = y0i + y4i;
+            a[j4] = y4i - y0i;
+            a[j4 + 1] = y0r - y4r;
+            x0r = y2r - y6i;
+            x0i = y2i + y6r;
+            a[j2] = wn4r * (x0r - x0i);
+            a[j2 + 1] = wn4r * (x0i + x0r);
+            x0r = y6r - y2i;
+            x0i = y2r + y6i;
+            a[j6] = wn4r * (x0r - x0i);
+            a[j6 + 1] = wn4r * (x0i + x0r);
+        }
+        k1 = 4;
+        for (k = 2 * m; k < n; k += m) {
+            k1 += 4;
+            wk1r = w[k1];
+            wk1i = w[k1 + 1];
+            wk2r = w[k1 + 2];
+            wk2i = w[k1 + 3];
+            wtmp = 2 * wk2i;
+            wk3r = wk1r - wtmp * wk1i;
+            wk3i = wtmp * wk1r - wk1i;
+            wk4r = 1 - wtmp * wk2i;
+            wk4i = wtmp * wk2r;
+            wtmp = 2 * wk4i;
+            wk5r = wk3r - wtmp * wk1i;
+            wk5i = wtmp * wk1r - wk3i;
+            wk6r = wk2r - wtmp * wk2i;
+            wk6i = wtmp * wk2r - wk2i;
+            wk7r = wk1r - wtmp * wk3i;
+            wk7i = wtmp * wk3r - wk1i;
+            for (j = k; j < l + k; j += 2) {
+                j1 = j + l;
+                j2 = j1 + l;
+                j3 = j2 + l;
+                j4 = j3 + l;
+                j5 = j4 + l;
+                j6 = j5 + l;
+                j7 = j6 + l;
+                x0r = a[j] + a[j1];
+                x0i = a[j + 1] + a[j1 + 1];
+                x1r = a[j] - a[j1];
+                x1i = a[j + 1] - a[j1 + 1];
+                x2r = a[j2] + a[j3];
+                x2i = a[j2 + 1] + a[j3 + 1];
+                x3r = a[j2] - a[j3];
+                x3i = a[j2 + 1] - a[j3 + 1];
+                y0r = x0r + x2r;
+                y0i = x0i + x2i;
+                y2r = x0r - x2r;
+                y2i = x0i - x2i;
+                y1r = x1r - x3i;
+                y1i = x1i + x3r;
+                y3r = x1r + x3i;
+                y3i = x1i - x3r;
+                x0r = a[j4] + a[j5];
+                x0i = a[j4 + 1] + a[j5 + 1];
+                x1r = a[j4] - a[j5];
+                x1i = a[j4 + 1] - a[j5 + 1];
+                x2r = a[j6] + a[j7];
+                x2i = a[j6 + 1] + a[j7 + 1];
+                x3r = a[j6] - a[j7];
+                x3i = a[j6 + 1] - a[j7 + 1];
+                y4r = x0r + x2r;
+                y4i = x0i + x2i;
+                y6r = x0r - x2r;
+                y6i = x0i - x2i;
+                x0r = x1r - x3i;
+                x0i = x1i + x3r;
+                x2r = x1r + x3i;
+                x2i = x1i - x3r;
+                y5r = wn4r * (x0r - x0i);
+                y5i = wn4r * (x0r + x0i);
+                y7r = wn4r * (x2r - x2i);
+                y7i = wn4r * (x2r + x2i);
+                x0r = y1r + y5r;
+                x0i = y1i + y5i;
+                a[j1] = wk1r * x0r - wk1i * x0i;
+                a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+                x0r = y1r - y5r;
+                x0i = y1i - y5i;
+                a[j5] = wk5r * x0r - wk5i * x0i;
+                a[j5 + 1] = wk5r * x0i + wk5i * x0r;
+                x0r = y3r - y7i;
+                x0i = y3i + y7r;
+                a[j3] = wk3r * x0r - wk3i * x0i;
+                a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+                x0r = y3r + y7i;
+                x0i = y3i - y7r;
+                a[j7] = wk7r * x0r - wk7i * x0i;
+                a[j7 + 1] = wk7r * x0i + wk7i * x0r;
+                a[j] = y0r + y4r;
+                a[j + 1] = y0i + y4i;
+                x0r = y0r - y4r;
+                x0i = y0i - y4i;
+                a[j4] = wk4r * x0r - wk4i * x0i;
+                a[j4 + 1] = wk4r * x0i + wk4i * x0r;
+                x0r = y2r - y6i;
+                x0i = y2i + y6r;
+                a[j2] = wk2r * x0r - wk2i * x0i;
+                a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+                x0r = y2r + y6i;
+                x0i = y2i - y6r;
+                a[j6] = wk6r * x0r - wk6i * x0i;
+                a[j6 + 1] = wk6r * x0i + wk6i * x0r;
+            }
+        }
+    }
+}
+
+
+void rftfsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr - wki * xi;
+        yi = wkr * xi + wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+
+void rftbsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    a[1] = -a[1];
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr + wki * xi;
+        yi = wkr * xi - wki * xr;
+        a[j] -= yr;
+        a[j + 1] = yi - a[j + 1];
+        a[k] += yr;
+        a[k + 1] = yi - a[k + 1];
+    }
+    a[m + 1] = -a[m + 1];
+}
+
+
+void dctsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n;
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        kk += ks;
+        wkr = c[kk] - c[nc - kk];
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[j] - wkr * a[k];
+        a[j] = wkr * a[j] + wki * a[k];
+        a[k] = xr;
+    }
+    a[m] *= c[0];
+}
+
+
+void dstsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n;
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        kk += ks;
+        wkr = c[kk] - c[nc - kk];
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[k] - wkr * a[j];
+        a[k] = wkr * a[k] + wki * a[j];
+        a[j] = xr;
+    }
+    a[m] *= c[0];
+}
+
diff --git a/fft8g.f b/fft8g.f
new file mode 100644
index 0000000..f9b8fa8
--- /dev/null
+++ b/fft8g.f
@@ -0,0 +1,1537 @@
+! Fast Fourier/Cosine/Sine Transform
+!     dimension   :one
+!     data length :power of 2
+!     decimation  :frequency
+!     radix       :8, 4, 2
+!     data        :inplace
+!     table       :use
+! subroutines
+!     cdft: Complex Discrete Fourier Transform
+!     rdft: Real Discrete Fourier Transform
+!     ddct: Discrete Cosine Transform
+!     ddst: Discrete Sine Transform
+!     dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+!     dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+!     [definition]
+!         <case1>
+!             X(k) = sum_j=0^n-1 x(j)*exp(2*pi*i*j*k/n), 0<=k<n
+!         <case2>
+!             X(k) = sum_j=0^n-1 x(j)*exp(-2*pi*i*j*k/n), 0<=k<n
+!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call cdft(2*n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call cdft(2*n, -1, a, ip, w)
+!     [parameters]
+!         2*n          :data length (integer)
+!                       n >= 1, n = power of 2
+!         a(0:2*n-1)   :input/output data (real*8)
+!                       input data
+!                           a(2*j) = Re(x(j)), 
+!                           a(2*j+1) = Im(x(j)), 0<=j<n
+!                       output data
+!                           a(2*k) = Re(X(k)), 
+!                           a(2*k+1) = Im(X(k)), 0<=k<n
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n/2-1)   :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call cdft(2*n, -1, a, ip, w)
+!         is 
+!             call cdft(2*n, 1, a, ip, w)
+!             do j = 0, 2 * n - 1
+!                 a(j) = a(j) / n
+!             end do
+!         .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+!     [definition]
+!         <case1> RDFT
+!             R(k) = sum_j=0^n-1 a(j)*cos(2*pi*j*k/n), 0<=k<=n/2
+!             I(k) = sum_j=0^n-1 a(j)*sin(2*pi*j*k/n), 0<k<n/2
+!         <case2> IRDFT (excluding scale)
+!             a(k) = (R(0) + R(n/2)*cos(pi*k))/2 + 
+!                    sum_j=1^n/2-1 R(j)*cos(2*pi*j*k/n) + 
+!                    sum_j=1^n/2-1 I(j)*sin(2*pi*j*k/n), 0<=k<n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call rdft(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call rdft(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       <case1>
+!                           output data
+!                               a(2*k) = R(k), 0<=k<n/2
+!                               a(2*k+1) = I(k), 0<k<n/2
+!                               a(1) = R(n/2)
+!                       <case2>
+!                           input data
+!                               a(2*j) = R(j), 0<=j<n/2
+!                               a(2*j+1) = I(j), 0<j<n/2
+!                               a(1) = R(n/2)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n/2-1)   :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call rdft(n, 1, a, ip, w)
+!         is 
+!             call rdft(n, -1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+!     [definition]
+!         <case1> IDCT (excluding scale)
+!             C(k) = sum_j=0^n-1 a(j)*cos(pi*j*(k+1/2)/n), 0<=k<n
+!         <case2> DCT
+!             C(k) = sum_j=0^n-1 a(j)*cos(pi*(j+1/2)*k/n), 0<=k<n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddct(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddct(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       output data
+!                           a(k) = C(k), 0<=k<n
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/4-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddct(n, -1, a, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             call ddct(n, 1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+!     [definition]
+!         <case1> IDST (excluding scale)
+!             S(k) = sum_j=1^n A(j)*sin(pi*j*(k+1/2)/n), 0<=k<n
+!         <case2> DST
+!             S(k) = sum_j=0^n-1 a(j)*sin(pi*(j+1/2)*k/n), 0<k<=n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddst(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddst(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       <case1>
+!                           input data
+!                               a(j) = A(j), 0<j<n
+!                               a(0) = A(n)
+!                           output data
+!                               a(k) = S(k), 0<=k<n
+!                       <case2>
+!                           output data
+!                               a(k) = S(k), 0<k<n
+!                               a(0) = S(n)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/4-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddst(n, -1, a, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             call ddst(n, 1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+!     [definition]
+!         C(k) = sum_j=0^n a(j)*cos(pi*j*k/n), 0<=k<=n
+!     [usage]
+!         ip(0) = 0  ! first time only
+!         call dfct(n, a, t, ip, w)
+!     [parameters]
+!         n            :data length - 1 (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n)       :input/output data (real*8)
+!                       output data
+!                           a(k) = C(k), 0<=k<=n
+!         t(0:n/2)     :work area (real*8)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/4)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/8-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             a(0) = a(0) / 2
+!             a(n) = a(n) / 2
+!             call dfct(n, a, t, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             a(n) = a(n) / 2
+!             call dfct(n, a, t, ip, w)
+!             do j = 0, n
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+!     [definition]
+!         S(k) = sum_j=1^n-1 a(j)*sin(pi*j*k/n), 0<k<n
+!     [usage]
+!         ip(0) = 0  ! first time only
+!         call dfst(n, a, t, ip, w)
+!     [parameters]
+!         n            :data length + 1 (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       output data
+!                           a(k) = S(k), 0<k<n
+!                       (a(0) is used for work area)
+!         t(0:n/2-1)   :work area (real*8)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/4)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/8-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call dfst(n, a, t, ip, w)
+!         is 
+!             call dfst(n, a, t, ip, w)
+!             do j = 1, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! Appendix :
+!     The cos/sin table is recalculated when the larger table required.
+!     w() and ip() are compatible with all routines.
+!
+!
+      subroutine cdft(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *)
+      real*8 a(0 : n - 1), w(0 : *)
+      if (n .gt. 4 * ip(0)) then
+          call makewt(n / 4, ip, w)
+      end if
+      if (n .gt. 4) then
+          if (isgn .ge. 0) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+          else
+              call bitrv2conj(n, ip(2), a)
+              call cftbsub(n, a, w)
+          end if
+      else if (n .eq. 4) then
+          call cftfsub(n, a, w)
+      end if
+      end
+!
+      subroutine rdft(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xi
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 4 * nc) then
+          nc = n / 4
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+          xi = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          a(1) = xi
+      else
+          a(1) = 0.5d0 * (a(0) - a(1))
+          a(0) = a(0) - a(1)
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call bitrv2(n, ip(2), a)
+              call cftbsub(n, a, w)
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+      end if
+      end
+!
+      subroutine ddct(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), j, nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xr
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          xr = a(n - 1)
+          do j = n - 2, 2, -2
+              a(j + 1) = a(j) - a(j - 1)
+              a(j) = a(j) + a(j - 1)
+          end do
+          a(1) = a(0) - xr
+          a(0) = a(0) + xr
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call bitrv2(n, ip(2), a)
+              call cftbsub(n, a, w)
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+      end if
+      call dctsub(n, a, nc, w(nw))
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+          xr = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          do j = 2, n - 2, 2
+              a(j - 1) = a(j) - a(j + 1)
+              a(j) = a(j) + a(j + 1)
+          end do
+          a(n - 1) = xr
+      end if
+      end
+!
+      subroutine ddst(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), j, nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xr
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          xr = a(n - 1)
+          do j = n - 2, 2, -2
+              a(j + 1) = -a(j) - a(j - 1)
+              a(j) = a(j) - a(j - 1)
+          end do
+          a(1) = a(0) + xr
+          a(0) = a(0) - xr
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call bitrv2(n, ip(2), a)
+              call cftbsub(n, a, w)
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+      end if
+      call dstsub(n, a, nc, w(nw))
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call bitrv2(n, ip(2), a)
+              call cftfsub(n, a, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, w)
+          end if
+          xr = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          do j = 2, n - 2, 2
+              a(j - 1) = -a(j) - a(j + 1)
+              a(j) = a(j) - a(j + 1)
+          end do
+          a(n - 1) = -xr
+      end if
+      end
+!
+      subroutine dfct(n, a, t, ip, w)
+      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+      real*8 a(0 : n), t(0 : n / 2), w(0 : *), xr, xi, yr, yi
+      nw = ip(0)
+      if (n .gt. 8 * nw) then
+          nw = n / 8
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 2 * nc) then
+          nc = n / 2
+          call makect(nc, ip, w(nw))
+      end if
+      m = n / 2
+      yi = a(m)
+      xi = a(0) + a(n)
+      a(0) = a(0) - a(n)
+      t(0) = xi - yi
+      t(m) = xi + yi
+      if (n .gt. 2) then
+          mh = m / 2
+          do j = 1, mh - 1
+              k = m - j
+              xr = a(j) - a(n - j)
+              xi = a(j) + a(n - j)
+              yr = a(k) - a(n - k)
+              yi = a(k) + a(n - k)
+              a(j) = xr
+              a(k) = yr
+              t(j) = xi - yi
+              t(k) = xi + yi
+          end do
+          t(mh) = a(mh) + a(n - mh)
+          a(mh) = a(mh) - a(n - mh)
+          call dctsub(m, a, nc, w(nw))
+          if (m .gt. 4) then
+              call bitrv2(m, ip(2), a)
+              call cftfsub(m, a, w)
+              call rftfsub(m, a, nc, w(nw))
+          else if (m .eq. 4) then
+              call cftfsub(m, a, w)
+          end if
+          a(n - 1) = a(0) - a(1)
+          a(1) = a(0) + a(1)
+          do j = m - 2, 2, -2
+              a(2 * j + 1) = a(j) + a(j + 1)
+              a(2 * j - 1) = a(j) - a(j + 1)
+          end do
+          l = 2
+          m = mh
+          do while (m .ge. 2)
+              call dctsub(m, t, nc, w(nw))
+              if (m .gt. 4) then
+                  call bitrv2(m, ip(2), t)
+                  call cftfsub(m, t, w)
+                  call rftfsub(m, t, nc, w(nw))
+              else if (m .eq. 4) then
+                  call cftfsub(m, t, w)
+              end if
+              a(n - l) = t(0) - t(1)
+              a(l) = t(0) + t(1)
+              k = 0
+              do j = 2, m - 2, 2
+                  k = k + 4 * l
+                  a(k - l) = t(j) - t(j + 1)
+                  a(k + l) = t(j) + t(j + 1)
+              end do
+              l = 2 * l
+              mh = m / 2
+              do j = 0, mh - 1
+                  k = m - j
+                  t(j) = t(m + k) - t(m + j)
+                  t(k) = t(m + k) + t(m + j)
+              end do
+              t(mh) = t(m + mh)
+              m = mh
+          end do
+          a(l) = t(0)
+          a(n) = t(2) - t(1)
+          a(0) = t(2) + t(1)
+      else
+          a(1) = a(0)
+          a(2) = t(0)
+          a(0) = t(1)
+      end if
+      end
+!
+      subroutine dfst(n, a, t, ip, w)
+      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+      real*8 a(0 : n - 1), t(0 : n / 2 - 1), w(0 : *), xr, xi, yr, yi
+      nw = ip(0)
+      if (n .gt. 8 * nw) then
+          nw = n / 8
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 2 * nc) then
+          nc = n / 2
+          call makect(nc, ip, w(nw))
+      end if
+      if (n .gt. 2) then
+          m = n / 2
+          mh = m / 2
+          do j = 1, mh - 1
+              k = m - j
+              xr = a(j) + a(n - j)
+              xi = a(j) - a(n - j)
+              yr = a(k) + a(n - k)
+              yi = a(k) - a(n - k)
+              a(j) = xr
+              a(k) = yr
+              t(j) = xi + yi
+              t(k) = xi - yi
+          end do
+          t(0) = a(mh) - a(n - mh)
+          a(mh) = a(mh) + a(n - mh)
+          a(0) = a(m)
+          call dstsub(m, a, nc, w(nw))
+          if (m .gt. 4) then
+              call bitrv2(m, ip(2), a)
+              call cftfsub(m, a, w)
+              call rftfsub(m, a, nc, w(nw))
+          else if (m .eq. 4) then
+              call cftfsub(m, a, w)
+          end if
+          a(n - 1) = a(1) - a(0)
+          a(1) = a(0) + a(1)
+          do j = m - 2, 2, -2
+              a(2 * j + 1) = a(j) - a(j + 1)
+              a(2 * j - 1) = -a(j) - a(j + 1)
+          end do
+          l = 2
+          m = mh
+          do while (m .ge. 2)
+              call dstsub(m, t, nc, w(nw))
+              if (m .gt. 4) then
+                  call bitrv2(m, ip(2), t)
+                  call cftfsub(m, t, w)
+                  call rftfsub(m, t, nc, w(nw))
+              else if (m .eq. 4) then
+                  call cftfsub(m, t, w)
+              end if
+              a(n - l) = t(1) - t(0)
+              a(l) = t(0) + t(1)
+              k = 0
+              do j = 2, m - 2, 2
+                  k = k + 4 * l
+                  a(k - l) = -t(j) - t(j + 1)
+                  a(k + l) = t(j) - t(j + 1)
+              end do
+              l = 2 * l
+              mh = m / 2
+              do j = 1, mh - 1
+                  k = m - j
+                  t(j) = t(m + k) + t(m + j)
+                  t(k) = t(m + k) - t(m + j)
+              end do
+              t(0) = t(m + mh)
+              m = mh
+          end do
+          a(l) = t(0)
+      end if
+      a(0) = 0
+      end
+!
+! -------- initializing routines --------
+!
+      subroutine makewt(nw, ip, w)
+      integer nw, ip(0 : *), j, nwh
+      real*8 w(0 : nw - 1), delta, x, y
+      ip(0) = nw
+      ip(1) = 1
+      if (nw .gt. 2) then
+          nwh = nw / 2
+          delta = atan(1.0d0) / nwh
+          w(0) = 1
+          w(1) = 0
+          w(nwh) = cos(delta * nwh)
+          w(nwh + 1) = w(nwh)
+          if (nwh .gt. 2) then
+              do j = 2, nwh - 2, 2
+                  x = cos(delta * j)
+                  y = sin(delta * j)
+                  w(j) = x
+                  w(j + 1) = y
+                  w(nw - j) = y
+                  w(nw - j + 1) = x
+              end do
+              do j = nwh - 2, 2, -2
+                  x = w(2 * j)
+                  y = w(2 * j + 1)
+                  w(nwh + j) = x
+                  w(nwh + j + 1) = y
+              end do
+              call bitrv2(nw, ip(2), w)
+          end if
+      end if
+      end
+!
+      subroutine makect(nc, ip, c)
+      integer nc, ip(0 : *), j, nch
+      real*8 c(0 : nc - 1), delta
+      ip(1) = nc
+      if (nc .gt. 1) then
+          nch = nc / 2
+          delta = atan(1.0d0) / nch
+          c(0) = cos(delta * nch)
+          c(nch) = 0.5d0 * c(0)
+          do j = 1, nch - 1
+              c(j) = 0.5d0 * cos(delta * j)
+              c(nc - j) = 0.5d0 * sin(delta * j)
+          end do
+      end if
+      end
+!
+! -------- child routines --------
+!
+      subroutine bitrv2(n, ip, a)
+      integer n, ip(0 : *), j, j1, k, k1, l, m, m2
+      real*8 a(0 : n - 1), xr, xi, yr, yi
+      ip(0) = 0
+      l = n
+      m = 1
+      do while (8 * m .lt. l)
+          l = l / 2
+          do j = 0, m - 1
+              ip(m + j) = ip(j) + l
+          end do
+          m = m * 2
+      end do
+      m2 = 2 * m
+      if (8 * m .eq. l) then
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 - m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              j1 = 2 * k + m2 + ip(k)
+              k1 = j1 + m2
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+          end do
+      else
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + m2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+          end do
+      end if
+      end
+!
+      subroutine bitrv2conj(n, ip, a)
+      integer n, ip(0 : *), j, j1, k, k1, l, m, m2
+      real*8 a(0 : n - 1), xr, xi, yr, yi
+      ip(0) = 0
+      l = n
+      m = 1
+      do while (8 * m .lt. l)
+          l = l / 2
+          do j = 0, m - 1
+              ip(m + j) = ip(j) + l
+          end do
+          m = m * 2
+      end do
+      m2 = 2 * m
+      if (8 * m .eq. l) then
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 - m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + 2 * m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 2 * k + ip(k)
+              a(k1 + 1) = -a(k1 + 1)
+              j1 = k1 + m2
+              k1 = j1 + m2
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              k1 = k1 + m2
+              a(k1 + 1) = -a(k1 + 1)
+          end do
+      else
+          a(1) = -a(1)
+          a(m2 + 1) = -a(m2 + 1)
+          do k = 1, m - 1
+              do j = 0, k - 1
+                  j1 = 2 * j + ip(k)
+                  k1 = 2 * k + ip(j)
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + m2
+                  k1 = k1 + m2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 2 * k + ip(k)
+              a(k1 + 1) = -a(k1 + 1)
+              a(k1 + m2 + 1) = -a(k1 + m2 + 1)
+          end do
+      end if
+      end
+!
+      subroutine cftfsub(n, a, w)
+      integer n, j, j1, j2, j3, l
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      l = 2
+      if (n .ge. 16) then
+          call cft1st(n, a, w)
+          l = 16
+          do while (8 * l .le. n)
+              call cftmdl(n, l, a, w)
+              l = 8 * l
+          end do
+      end if
+      if (2 * l .lt. n) then
+          do j = 0, l - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              x0r = a(j) + a(j1)
+              x0i = a(j + 1) + a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = a(j + 1) - a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              a(j) = x0r + x2r
+              a(j + 1) = x0i + x2i
+              a(j2) = x0r - x2r
+              a(j2 + 1) = x0i - x2i
+              a(j1) = x1r - x3i
+              a(j1 + 1) = x1i + x3r
+              a(j3) = x1r + x3i
+              a(j3 + 1) = x1i - x3r
+          end do
+      else if (2 * l .eq. n) then
+          do j = 0, l - 2, 2
+              j1 = j + l
+              x0r = a(j) - a(j1)
+              x0i = a(j + 1) - a(j1 + 1)
+              a(j) = a(j) + a(j1)
+              a(j + 1) = a(j + 1) + a(j1 + 1)
+              a(j1) = x0r
+              a(j1 + 1) = x0i
+          end do
+      end if
+      end
+!
+      subroutine cftbsub(n, a, w)
+      integer n, j, j1, j2, j3, j4, j5, j6, j7, l
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+      l = 2
+      if (n .gt. 16) then
+          call cft1st(n, a, w)
+          l = 16
+          do while (8 * l .lt. n)
+              call cftmdl(n, l, a, w)
+              l = 8 * l
+          end do
+      end if
+      if (4 * l .lt. n) then
+          wn4r = w(2)
+          do j = 0, l - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              j4 = j3 + l
+              j5 = j4 + l
+              j6 = j5 + l
+              j7 = j6 + l
+              x0r = a(j) + a(j1)
+              x0i = -a(j + 1) - a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = -a(j + 1) + a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              y0r = x0r + x2r
+              y0i = x0i - x2i
+              y2r = x0r - x2r
+              y2i = x0i + x2i
+              y1r = x1r - x3i
+              y1i = x1i - x3r
+              y3r = x1r + x3i
+              y3i = x1i + x3r
+              x0r = a(j4) + a(j5)
+              x0i = a(j4 + 1) + a(j5 + 1)
+              x1r = a(j4) - a(j5)
+              x1i = a(j4 + 1) - a(j5 + 1)
+              x2r = a(j6) + a(j7)
+              x2i = a(j6 + 1) + a(j7 + 1)
+              x3r = a(j6) - a(j7)
+              x3i = a(j6 + 1) - a(j7 + 1)
+              y4r = x0r + x2r
+              y4i = x0i + x2i
+              y6r = x0r - x2r
+              y6i = x0i - x2i
+              x0r = x1r - x3i
+              x0i = x1i + x3r
+              x2r = x1r + x3i
+              x2i = x1i - x3r
+              y5r = wn4r * (x0r - x0i)
+              y5i = wn4r * (x0r + x0i)
+              y7r = wn4r * (x2r - x2i)
+              y7i = wn4r * (x2r + x2i)
+              a(j1) = y1r + y5r
+              a(j1 + 1) = y1i - y5i
+              a(j5) = y1r - y5r
+              a(j5 + 1) = y1i + y5i
+              a(j3) = y3r - y7i
+              a(j3 + 1) = y3i - y7r
+              a(j7) = y3r + y7i
+              a(j7 + 1) = y3i + y7r
+              a(j) = y0r + y4r
+              a(j + 1) = y0i - y4i
+              a(j4) = y0r - y4r
+              a(j4 + 1) = y0i + y4i
+              a(j2) = y2r - y6i
+              a(j2 + 1) = y2i - y6r
+              a(j6) = y2r + y6i
+              a(j6 + 1) = y2i + y6r
+          end do
+      else if (4 * l .eq. n) then
+          do j = 0, l - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              x0r = a(j) + a(j1)
+              x0i = -a(j + 1) - a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = -a(j + 1) + a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              a(j) = x0r + x2r
+              a(j + 1) = x0i - x2i
+              a(j2) = x0r - x2r
+              a(j2 + 1) = x0i + x2i
+              a(j1) = x1r - x3i
+              a(j1 + 1) = x1i - x3r
+              a(j3) = x1r + x3i
+              a(j3 + 1) = x1i + x3r
+          end do
+      else
+          do j = 0, l - 2, 2
+              j1 = j + l
+              x0r = a(j) - a(j1)
+              x0i = -a(j + 1) + a(j1 + 1)
+              a(j) = a(j) + a(j1)
+              a(j + 1) = -a(j + 1) - a(j1 + 1)
+              a(j1) = x0r
+              a(j1 + 1) = x0i
+          end do
+      end if
+      end
+!
+      subroutine cft1st(n, a, w)
+      integer n, j, k1
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wn4r, wtmp, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 wk4r, wk4i, wk5r, wk5i, wk6r, wk6i, wk7r, wk7i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+      wn4r = w(2)
+      x0r = a(0) + a(2)
+      x0i = a(1) + a(3)
+      x1r = a(0) - a(2)
+      x1i = a(1) - a(3)
+      x2r = a(4) + a(6)
+      x2i = a(5) + a(7)
+      x3r = a(4) - a(6)
+      x3i = a(5) - a(7)
+      y0r = x0r + x2r
+      y0i = x0i + x2i
+      y2r = x0r - x2r
+      y2i = x0i - x2i
+      y1r = x1r - x3i
+      y1i = x1i + x3r
+      y3r = x1r + x3i
+      y3i = x1i - x3r
+      x0r = a(8) + a(10)
+      x0i = a(9) + a(11)
+      x1r = a(8) - a(10)
+      x1i = a(9) - a(11)
+      x2r = a(12) + a(14)
+      x2i = a(13) + a(15)
+      x3r = a(12) - a(14)
+      x3i = a(13) - a(15)
+      y4r = x0r + x2r
+      y4i = x0i + x2i
+      y6r = x0r - x2r
+      y6i = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      x2r = x1r + x3i
+      x2i = x1i - x3r
+      y5r = wn4r * (x0r - x0i)
+      y5i = wn4r * (x0r + x0i)
+      y7r = wn4r * (x2r - x2i)
+      y7i = wn4r * (x2r + x2i)
+      a(2) = y1r + y5r
+      a(3) = y1i + y5i
+      a(10) = y1r - y5r
+      a(11) = y1i - y5i
+      a(6) = y3r - y7i
+      a(7) = y3i + y7r
+      a(14) = y3r + y7i
+      a(15) = y3i - y7r
+      a(0) = y0r + y4r
+      a(1) = y0i + y4i
+      a(8) = y0r - y4r
+      a(9) = y0i - y4i
+      a(4) = y2r - y6i
+      a(5) = y2i + y6r
+      a(12) = y2r + y6i
+      a(13) = y2i - y6r
+      if (n .gt. 16) then
+          wk1r = w(4)
+          wk1i = w(5)
+          x0r = a(16) + a(18)
+          x0i = a(17) + a(19)
+          x1r = a(16) - a(18)
+          x1i = a(17) - a(19)
+          x2r = a(20) + a(22)
+          x2i = a(21) + a(23)
+          x3r = a(20) - a(22)
+          x3i = a(21) - a(23)
+          y0r = x0r + x2r
+          y0i = x0i + x2i
+          y2r = x0r - x2r
+          y2i = x0i - x2i
+          y1r = x1r - x3i
+          y1i = x1i + x3r
+          y3r = x1r + x3i
+          y3i = x1i - x3r
+          x0r = a(24) + a(26)
+          x0i = a(25) + a(27)
+          x1r = a(24) - a(26)
+          x1i = a(25) - a(27)
+          x2r = a(28) + a(30)
+          x2i = a(29) + a(31)
+          x3r = a(28) - a(30)
+          x3i = a(29) - a(31)
+          y4r = x0r + x2r
+          y4i = x0i + x2i
+          y6r = x0r - x2r
+          y6i = x0i - x2i
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          x2r = x1r + x3i
+          x2i = x3r - x1i
+          y5r = wk1i * x0r - wk1r * x0i
+          y5i = wk1i * x0i + wk1r * x0r
+          y7r = wk1r * x2r + wk1i * x2i
+          y7i = wk1r * x2i - wk1i * x2r
+          x0r = wk1r * y1r - wk1i * y1i
+          x0i = wk1r * y1i + wk1i * y1r
+          a(18) = x0r + y5r
+          a(19) = x0i + y5i
+          a(26) = y5i - x0i
+          a(27) = x0r - y5r
+          x0r = wk1i * y3r - wk1r * y3i
+          x0i = wk1i * y3i + wk1r * y3r
+          a(22) = x0r - y7r
+          a(23) = x0i + y7i
+          a(30) = y7i - x0i
+          a(31) = x0r + y7r
+          a(16) = y0r + y4r
+          a(17) = y0i + y4i
+          a(24) = y4i - y0i
+          a(25) = y0r - y4r
+          x0r = y2r - y6i
+          x0i = y2i + y6r
+          a(20) = wn4r * (x0r - x0i)
+          a(21) = wn4r * (x0i + x0r)
+          x0r = y6r - y2i
+          x0i = y2r + y6i
+          a(28) = wn4r * (x0r - x0i)
+          a(29) = wn4r * (x0i + x0r)
+          k1 = 4
+          do j = 32, n - 16, 16
+              k1 = k1 + 4
+              wk1r = w(k1)
+              wk1i = w(k1 + 1)
+              wk2r = w(k1 + 2)
+              wk2i = w(k1 + 3)
+              wtmp = 2 * wk2i
+              wk3r = wk1r - wtmp * wk1i
+              wk3i = wtmp * wk1r - wk1i
+              wk4r = 1 - wtmp * wk2i
+              wk4i = wtmp * wk2r
+              wtmp = 2 * wk4i
+              wk5r = wk3r - wtmp * wk1i
+              wk5i = wtmp * wk1r - wk3i
+              wk6r = wk2r - wtmp * wk2i
+              wk6i = wtmp * wk2r - wk2i
+              wk7r = wk1r - wtmp * wk3i
+              wk7i = wtmp * wk3r - wk1i
+              x0r = a(j) + a(j + 2)
+              x0i = a(j + 1) + a(j + 3)
+              x1r = a(j) - a(j + 2)
+              x1i = a(j + 1) - a(j + 3)
+              x2r = a(j + 4) + a(j + 6)
+              x2i = a(j + 5) + a(j + 7)
+              x3r = a(j + 4) - a(j + 6)
+              x3i = a(j + 5) - a(j + 7)
+              y0r = x0r + x2r
+              y0i = x0i + x2i
+              y2r = x0r - x2r
+              y2i = x0i - x2i
+              y1r = x1r - x3i
+              y1i = x1i + x3r
+              y3r = x1r + x3i
+              y3i = x1i - x3r
+              x0r = a(j + 8) + a(j + 10)
+              x0i = a(j + 9) + a(j + 11)
+              x1r = a(j + 8) - a(j + 10)
+              x1i = a(j + 9) - a(j + 11)
+              x2r = a(j + 12) + a(j + 14)
+              x2i = a(j + 13) + a(j + 15)
+              x3r = a(j + 12) - a(j + 14)
+              x3i = a(j + 13) - a(j + 15)
+              y4r = x0r + x2r
+              y4i = x0i + x2i
+              y6r = x0r - x2r
+              y6i = x0i - x2i
+              x0r = x1r - x3i
+              x0i = x1i + x3r
+              x2r = x1r + x3i
+              x2i = x1i - x3r
+              y5r = wn4r * (x0r - x0i)
+              y5i = wn4r * (x0r + x0i)
+              y7r = wn4r * (x2r - x2i)
+              y7i = wn4r * (x2r + x2i)
+              x0r = y1r + y5r
+              x0i = y1i + y5i
+              a(j + 2) = wk1r * x0r - wk1i * x0i
+              a(j + 3) = wk1r * x0i + wk1i * x0r
+              x0r = y1r - y5r
+              x0i = y1i - y5i
+              a(j + 10) = wk5r * x0r - wk5i * x0i
+              a(j + 11) = wk5r * x0i + wk5i * x0r
+              x0r = y3r - y7i
+              x0i = y3i + y7r
+              a(j + 6) = wk3r * x0r - wk3i * x0i
+              a(j + 7) = wk3r * x0i + wk3i * x0r
+              x0r = y3r + y7i
+              x0i = y3i - y7r
+              a(j + 14) = wk7r * x0r - wk7i * x0i
+              a(j + 15) = wk7r * x0i + wk7i * x0r
+              a(j) = y0r + y4r
+              a(j + 1) = y0i + y4i
+              x0r = y0r - y4r
+              x0i = y0i - y4i
+              a(j + 8) = wk4r * x0r - wk4i * x0i
+              a(j + 9) = wk4r * x0i + wk4i * x0r
+              x0r = y2r - y6i
+              x0i = y2i + y6r
+              a(j + 4) = wk2r * x0r - wk2i * x0i
+              a(j + 5) = wk2r * x0i + wk2i * x0r
+              x0r = y2r + y6i
+              x0i = y2i - y6r
+              a(j + 12) = wk6r * x0r - wk6i * x0i
+              a(j + 13) = wk6r * x0i + wk6i * x0r
+          end do
+      end if
+      end
+!
+      subroutine cftmdl(n, l, a, w)
+      integer n, l, j, j1, j2, j3, j4, j5, j6, j7, k, k1, m
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wn4r, wtmp, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 wk4r, wk4i, wk5r, wk5i, wk6r, wk6i, wk7r, wk7i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+      m = 8 * l
+      wn4r = w(2)
+      do j = 0, l - 2, 2
+          j1 = j + l
+          j2 = j1 + l
+          j3 = j2 + l
+          j4 = j3 + l
+          j5 = j4 + l
+          j6 = j5 + l
+          j7 = j6 + l
+          x0r = a(j) + a(j1)
+          x0i = a(j + 1) + a(j1 + 1)
+          x1r = a(j) - a(j1)
+          x1i = a(j + 1) - a(j1 + 1)
+          x2r = a(j2) + a(j3)
+          x2i = a(j2 + 1) + a(j3 + 1)
+          x3r = a(j2) - a(j3)
+          x3i = a(j2 + 1) - a(j3 + 1)
+          y0r = x0r + x2r
+          y0i = x0i + x2i
+          y2r = x0r - x2r
+          y2i = x0i - x2i
+          y1r = x1r - x3i
+          y1i = x1i + x3r
+          y3r = x1r + x3i
+          y3i = x1i - x3r
+          x0r = a(j4) + a(j5)
+          x0i = a(j4 + 1) + a(j5 + 1)
+          x1r = a(j4) - a(j5)
+          x1i = a(j4 + 1) - a(j5 + 1)
+          x2r = a(j6) + a(j7)
+          x2i = a(j6 + 1) + a(j7 + 1)
+          x3r = a(j6) - a(j7)
+          x3i = a(j6 + 1) - a(j7 + 1)
+          y4r = x0r + x2r
+          y4i = x0i + x2i
+          y6r = x0r - x2r
+          y6i = x0i - x2i
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          x2r = x1r + x3i
+          x2i = x1i - x3r
+          y5r = wn4r * (x0r - x0i)
+          y5i = wn4r * (x0r + x0i)
+          y7r = wn4r * (x2r - x2i)
+          y7i = wn4r * (x2r + x2i)
+          a(j1) = y1r + y5r
+          a(j1 + 1) = y1i + y5i
+          a(j5) = y1r - y5r
+          a(j5 + 1) = y1i - y5i
+          a(j3) = y3r - y7i
+          a(j3 + 1) = y3i + y7r
+          a(j7) = y3r + y7i
+          a(j7 + 1) = y3i - y7r
+          a(j) = y0r + y4r
+          a(j + 1) = y0i + y4i
+          a(j4) = y0r - y4r
+          a(j4 + 1) = y0i - y4i
+          a(j2) = y2r - y6i
+          a(j2 + 1) = y2i + y6r
+          a(j6) = y2r + y6i
+          a(j6 + 1) = y2i - y6r
+      end do
+      if (m .lt. n) then
+          wk1r = w(4)
+          wk1i = w(5)
+          do j = m, l + m - 2, 2
+              j1 = j + l
+              j2 = j1 + l
+              j3 = j2 + l
+              j4 = j3 + l
+              j5 = j4 + l
+              j6 = j5 + l
+              j7 = j6 + l
+              x0r = a(j) + a(j1)
+              x0i = a(j + 1) + a(j1 + 1)
+              x1r = a(j) - a(j1)
+              x1i = a(j + 1) - a(j1 + 1)
+              x2r = a(j2) + a(j3)
+              x2i = a(j2 + 1) + a(j3 + 1)
+              x3r = a(j2) - a(j3)
+              x3i = a(j2 + 1) - a(j3 + 1)
+              y0r = x0r + x2r
+              y0i = x0i + x2i
+              y2r = x0r - x2r
+              y2i = x0i - x2i
+              y1r = x1r - x3i
+              y1i = x1i + x3r
+              y3r = x1r + x3i
+              y3i = x1i - x3r
+              x0r = a(j4) + a(j5)
+              x0i = a(j4 + 1) + a(j5 + 1)
+              x1r = a(j4) - a(j5)
+              x1i = a(j4 + 1) - a(j5 + 1)
+              x2r = a(j6) + a(j7)
+              x2i = a(j6 + 1) + a(j7 + 1)
+              x3r = a(j6) - a(j7)
+              x3i = a(j6 + 1) - a(j7 + 1)
+              y4r = x0r + x2r
+              y4i = x0i + x2i
+              y6r = x0r - x2r
+              y6i = x0i - x2i
+              x0r = x1r - x3i
+              x0i = x1i + x3r
+              x2r = x1r + x3i
+              x2i = x3r - x1i
+              y5r = wk1i * x0r - wk1r * x0i
+              y5i = wk1i * x0i + wk1r * x0r
+              y7r = wk1r * x2r + wk1i * x2i
+              y7i = wk1r * x2i - wk1i * x2r
+              x0r = wk1r * y1r - wk1i * y1i
+              x0i = wk1r * y1i + wk1i * y1r
+              a(j1) = x0r + y5r
+              a(j1 + 1) = x0i + y5i
+              a(j5) = y5i - x0i
+              a(j5 + 1) = x0r - y5r
+              x0r = wk1i * y3r - wk1r * y3i
+              x0i = wk1i * y3i + wk1r * y3r
+              a(j3) = x0r - y7r
+              a(j3 + 1) = x0i + y7i
+              a(j7) = y7i - x0i
+              a(j7 + 1) = x0r + y7r
+              a(j) = y0r + y4r
+              a(j + 1) = y0i + y4i
+              a(j4) = y4i - y0i
+              a(j4 + 1) = y0r - y4r
+              x0r = y2r - y6i
+              x0i = y2i + y6r
+              a(j2) = wn4r * (x0r - x0i)
+              a(j2 + 1) = wn4r * (x0i + x0r)
+              x0r = y6r - y2i
+              x0i = y2r + y6i
+              a(j6) = wn4r * (x0r - x0i)
+              a(j6 + 1) = wn4r * (x0i + x0r)
+          end do
+          k1 = 4
+          do k = 2 * m, n - m, m
+              k1 = k1 + 4
+              wk1r = w(k1)
+              wk1i = w(k1 + 1)
+              wk2r = w(k1 + 2)
+              wk2i = w(k1 + 3)
+              wtmp = 2 * wk2i
+              wk3r = wk1r - wtmp * wk1i
+              wk3i = wtmp * wk1r - wk1i
+              wk4r = 1 - wtmp * wk2i
+              wk4i = wtmp * wk2r
+              wtmp = 2 * wk4i
+              wk5r = wk3r - wtmp * wk1i
+              wk5i = wtmp * wk1r - wk3i
+              wk6r = wk2r - wtmp * wk2i
+              wk6i = wtmp * wk2r - wk2i
+              wk7r = wk1r - wtmp * wk3i
+              wk7i = wtmp * wk3r - wk1i
+              do j = k, l + k - 2, 2
+                  j1 = j + l
+                  j2 = j1 + l
+                  j3 = j2 + l
+                  j4 = j3 + l
+                  j5 = j4 + l
+                  j6 = j5 + l
+                  j7 = j6 + l
+                  x0r = a(j) + a(j1)
+                  x0i = a(j + 1) + a(j1 + 1)
+                  x1r = a(j) - a(j1)
+                  x1i = a(j + 1) - a(j1 + 1)
+                  x2r = a(j2) + a(j3)
+                  x2i = a(j2 + 1) + a(j3 + 1)
+                  x3r = a(j2) - a(j3)
+                  x3i = a(j2 + 1) - a(j3 + 1)
+                  y0r = x0r + x2r
+                  y0i = x0i + x2i
+                  y2r = x0r - x2r
+                  y2i = x0i - x2i
+                  y1r = x1r - x3i
+                  y1i = x1i + x3r
+                  y3r = x1r + x3i
+                  y3i = x1i - x3r
+                  x0r = a(j4) + a(j5)
+                  x0i = a(j4 + 1) + a(j5 + 1)
+                  x1r = a(j4) - a(j5)
+                  x1i = a(j4 + 1) - a(j5 + 1)
+                  x2r = a(j6) + a(j7)
+                  x2i = a(j6 + 1) + a(j7 + 1)
+                  x3r = a(j6) - a(j7)
+                  x3i = a(j6 + 1) - a(j7 + 1)
+                  y4r = x0r + x2r
+                  y4i = x0i + x2i
+                  y6r = x0r - x2r
+                  y6i = x0i - x2i
+                  x0r = x1r - x3i
+                  x0i = x1i + x3r
+                  x2r = x1r + x3i
+                  x2i = x1i - x3r
+                  y5r = wn4r * (x0r - x0i)
+                  y5i = wn4r * (x0r + x0i)
+                  y7r = wn4r * (x2r - x2i)
+                  y7i = wn4r * (x2r + x2i)
+                  x0r = y1r + y5r
+                  x0i = y1i + y5i
+                  a(j1) = wk1r * x0r - wk1i * x0i
+                  a(j1 + 1) = wk1r * x0i + wk1i * x0r
+                  x0r = y1r - y5r
+                  x0i = y1i - y5i
+                  a(j5) = wk5r * x0r - wk5i * x0i
+                  a(j5 + 1) = wk5r * x0i + wk5i * x0r
+                  x0r = y3r - y7i
+                  x0i = y3i + y7r
+                  a(j3) = wk3r * x0r - wk3i * x0i
+                  a(j3 + 1) = wk3r * x0i + wk3i * x0r
+                  x0r = y3r + y7i
+                  x0i = y3i - y7r
+                  a(j7) = wk7r * x0r - wk7i * x0i
+                  a(j7 + 1) = wk7r * x0i + wk7i * x0r
+                  a(j) = y0r + y4r
+                  a(j + 1) = y0i + y4i
+                  x0r = y0r - y4r
+                  x0i = y0i - y4i
+                  a(j4) = wk4r * x0r - wk4i * x0i
+                  a(j4 + 1) = wk4r * x0i + wk4i * x0r
+                  x0r = y2r - y6i
+                  x0i = y2i + y6r
+                  a(j2) = wk2r * x0r - wk2i * x0i
+                  a(j2 + 1) = wk2r * x0i + wk2i * x0r
+                  x0r = y2r + y6i
+                  x0i = y2i - y6r
+                  a(j6) = wk6r * x0r - wk6i * x0i
+                  a(j6 + 1) = wk6r * x0i + wk6i * x0r
+              end do
+          end do
+      end if
+      end
+!
+      subroutine rftfsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+      m = n / 2
+      ks = 2 * nc / m
+      kk = 0
+      do j = 2, m - 2, 2
+          k = n - j
+          kk = kk + ks
+          wkr = 0.5d0 - c(nc - kk)
+          wki = c(kk)
+          xr = a(j) - a(k)
+          xi = a(j + 1) + a(k + 1)
+          yr = wkr * xr - wki * xi
+          yi = wkr * xi + wki * xr
+          a(j) = a(j) - yr
+          a(j + 1) = a(j + 1) - yi
+          a(k) = a(k) + yr
+          a(k + 1) = a(k + 1) - yi
+      end do
+      end
+!
+      subroutine rftbsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+      a(1) = -a(1)
+      m = n / 2
+      ks = 2 * nc / m
+      kk = 0
+      do j = 2, m - 2, 2
+          k = n - j
+          kk = kk + ks
+          wkr = 0.5d0 - c(nc - kk)
+          wki = c(kk)
+          xr = a(j) - a(k)
+          xi = a(j + 1) + a(k + 1)
+          yr = wkr * xr + wki * xi
+          yi = wkr * xi - wki * xr
+          a(j) = a(j) - yr
+          a(j + 1) = yi - a(j + 1)
+          a(k) = a(k) + yr
+          a(k + 1) = yi - a(k + 1)
+      end do
+      a(m + 1) = -a(m + 1)
+      end
+!
+      subroutine dctsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+      m = n / 2
+      ks = nc / n
+      kk = 0
+      do j = 1, m - 1
+          k = n - j
+          kk = kk + ks
+          wkr = c(kk) - c(nc - kk)
+          wki = c(kk) + c(nc - kk)
+          xr = wki * a(j) - wkr * a(k)
+          a(j) = wkr * a(j) + wki * a(k)
+          a(k) = xr
+      end do
+      a(m) = c(0) * a(m)
+      end
+!
+      subroutine dstsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+      m = n / 2
+      ks = nc / n
+      kk = 0
+      do j = 1, m - 1
+          k = n - j
+          kk = kk + ks
+          wkr = c(kk) - c(nc - kk)
+          wki = c(kk) + c(nc - kk)
+          xr = wki * a(k) - wkr * a(j)
+          a(k) = wkr * a(k) + wki * a(j)
+          a(j) = xr
+      end do
+      a(m) = c(0) * a(m)
+      end
+!
diff --git a/fft8g_h.c b/fft8g_h.c
new file mode 100644
index 0000000..4711a6e
--- /dev/null
+++ b/fft8g_h.c
@@ -0,0 +1,1659 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :8, 4, 2
+    data        :inplace
+    table       :not use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *);
+    void rdft(int, int, double *);
+    void ddct(int, int, double *);
+    void ddst(int, int, double *);
+    void dfct(int, double *);
+    void dfst(int, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            cdft(2*n, 1, a);
+        <case2>
+            cdft(2*n, -1, a);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]), 
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]), 
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+    [remark]
+        Inverse of 
+            cdft(2*n, -1, a);
+        is 
+            cdft(2*n, 1, a);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            rdft(n, 1, a);
+        <case2>
+            rdft(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+    [remark]
+        Inverse of 
+            rdft(n, 1, a);
+        is 
+            rdft(n, -1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ddct(n, 1, a);
+        <case2>
+            ddct(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+    [remark]
+        Inverse of 
+            ddct(n, -1, a);
+        is 
+            a[0] *= 0.5;
+            ddct(n, 1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ddst(n, 1, a);
+        <case2>
+            ddst(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+    [remark]
+        Inverse of 
+            ddst(n, -1, a);
+        is 
+            a[0] *= 0.5;
+            ddst(n, 1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        dfct(n, a);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+    [remark]
+        Inverse of 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a);
+        is 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        dfst(n, a);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+    [remark]
+        Inverse of 
+            dfst(n, a);
+        is 
+            dfst(n, a);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+*/
+
+
+void cdft(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void bitrv2conj(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    
+    if (n > 4) {
+        if (isgn >= 0) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+        } else {
+            bitrv2conj(n, a);
+            cftbsub(n, a);
+        }
+    } else if (n == 4) {
+        cftfsub(n, a);
+    }
+}
+
+
+void rdft(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    double xi;
+    
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a);
+            bitrv2(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+    }
+}
+
+
+void ddct(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    void dctsub(int n, double *a);
+    void dctsub4(int n, double *a);
+    int j;
+    double xr;
+    
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a);
+            bitrv2(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+    }
+    if (n > 4) {
+        dctsub(n, a);
+    } else {
+        dctsub4(n, a);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a)
+{
+    void bitrv2(int n, double *a);
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    void dstsub(int n, double *a);
+    void dstsub4(int n, double *a);
+    int j;
+    double xr;
+    
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a);
+            bitrv2(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+    }
+    if (n > 4) {
+        dstsub(n, a);
+    } else {
+        dstsub4(n, a);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, a);
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a)
+{
+    void ddct(int n, int isgn, double *a);
+    void bitrv1(int n, double *a);
+    int j, k, m, mh;
+    double xr, xi, yr, yi, an;
+    
+    m = n >> 1;
+    for (j = 0; j < m; j++) {
+        k = n - j;
+        xr = a[j] + a[k];
+        a[j] -= a[k];
+        a[k] = xr;
+    }
+    an = a[n];
+    while (m >= 2) {
+        ddct(m, 1, a);
+        bitrv1(m, a);
+        mh = m >> 1;
+        xi = a[m];
+        a[m] = a[0];
+        a[0] = an - xi;
+        an += xi;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[m + k];
+            xi = a[m + j];
+            yr = a[j];
+            yi = a[k];
+            a[m + j] = yr;
+            a[m + k] = yi;
+            a[j] = xr - xi;
+            a[k] = xr + xi;
+        }
+        xr = a[mh];
+        a[mh] = a[m + mh];
+        a[m + mh] = xr;
+        m = mh;
+    }
+    xi = a[1];
+    a[1] = a[0];
+    a[0] = an + xi;
+    a[n] = an - xi;
+    bitrv1(n, a);
+}
+
+
+void dfst(int n, double *a)
+{
+    void ddst(int n, int isgn, double *a);
+    void bitrv1(int n, double *a);
+    int j, k, m, mh;
+    double xr, xi, yr, yi;
+    
+    m = n >> 1;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        xr = a[j] - a[k];
+        a[j] += a[k];
+        a[k] = xr;
+    }
+    a[0] = a[m];
+    while (m >= 2) {
+        ddst(m, 1, a);
+        bitrv1(m, a);
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[m + k];
+            xi = a[m + j];
+            yr = a[j];
+            yi = a[k];
+            a[m + j] = yr;
+            a[m + k] = yi;
+            a[j] = xr + xi;
+            a[k] = xr - xi;
+        }
+        a[m] = a[0];
+        a[0] = a[m + mh];
+        a[m + mh] = a[mh];
+        m = mh;
+    }
+    a[1] = a[0];
+    a[0] = 0;
+    bitrv1(n, a);
+}
+
+
+/* -------- child routines -------- */
+
+
+#include <math.h>
+#ifndef M_PI_2
+#define M_PI_2      1.570796326794896619231321691639751442098584699687
+#endif
+#ifndef WR5000  /* cos(M_PI_2*0.5000) */
+#define WR5000      0.707106781186547524400844362104849039284835937688
+#endif
+#ifndef WR2500  /* cos(M_PI_2*0.2500) */
+#define WR2500      0.923879532511286756128183189396788286822416625863
+#endif
+#ifndef WI2500  /* sin(M_PI_2*0.2500) */
+#define WI2500      0.382683432365089771728459984030398866761344562485
+#endif
+
+
+#ifndef RDFT_LOOP_DIV  /* control of the RDFT's speed & tolerance */
+#define RDFT_LOOP_DIV 64
+#endif
+
+#ifndef DCST_LOOP_DIV  /* control of the DCT,DST's speed & tolerance */
+#define DCST_LOOP_DIV 64
+#endif
+
+
+void bitrv2(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k;
+    double xr, xi, yr, yi;
+    
+    l = n >> 2;
+    m = 2;
+    while (m < l) {
+        l >>= 1;
+        m <<= 1;
+    }
+    if (m == l) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 2) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = a[j + 1];
+                yr = a[k];
+                yi = a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            j1 = j0 + k0 + m;
+            k1 = j1 + m;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        j0 = 0;
+        for (k0 = 2; k0 < m; k0 += 2) {
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = a[j + 1];
+                yr = a[k];
+                yi = a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+        }
+    }
+}
+
+
+void bitrv2conj(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k;
+    double xr, xi, yr, yi;
+    
+    l = n >> 2;
+    m = 2;
+    while (m < l) {
+        l >>= 1;
+        m <<= 1;
+    }
+    if (m == l) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 2) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = -a[j + 1];
+                yr = a[k];
+                yi = -a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            a[k1 + 1] = -a[k1 + 1];
+            j1 = k1 + m;
+            k1 = j1 + m;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            k1 += m;
+            a[k1 + 1] = -a[k1 + 1];
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        a[1] = -a[1];
+        a[m + 1] = -a[m + 1];
+        j0 = 0;
+        for (k0 = 2; k0 < m; k0 += 2) {
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                xr = a[j];
+                xi = -a[j + 1];
+                yr = a[k];
+                yi = -a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            a[k1 + 1] = -a[k1 + 1];
+            a[k1 + m + 1] = -a[k1 + m + 1];
+        }
+    }
+}
+
+
+void bitrv1(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k;
+    double x;
+    
+    l = n >> 2;
+    m = 1;
+    while (m < l) {
+        l >>= 1;
+        m <<= 1;
+    }
+    if (m == l) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0++) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j++) {
+                x = a[j];
+                a[j] = a[k];
+                a[k] = x;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 += m;
+                k1 -= m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 += m;
+                k1 += 2 * m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+            j1 = j0 + k0 + m;
+            k1 = j1 + m;
+            x = a[j1];
+            a[j1] = a[k1];
+            a[k1] = x;
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        j0 = 0;
+        for (k0 = 1; k0 < m; k0++) {
+            for (i = n >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j++) {
+                x = a[j];
+                a[j] = a[k];
+                a[k] = x;
+                j1 = j + m;
+                k1 = k + m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                for (i = n >> 1; i > (k ^= i); i >>= 1);
+            }
+        }
+    }
+}
+
+
+void cftfsub(int n, double *a)
+{
+    void cft1st(int n, double *a);
+    void cftmdl(int n, int l, double *a);
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    l = 2;
+    if (n >= 16) {
+        cft1st(n, a);
+        l = 16;
+        while ((l << 3) <= n) {
+            cftmdl(n, l, a);
+            l <<= 3;
+        }
+    }
+    if ((l << 1) < n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i - x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i + x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i - x3r;
+        }
+    } else if ((l << 1) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = a[j + 1] - a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] += a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cftbsub(int n, double *a)
+{
+    void cft1st(int n, double *a);
+    void cftmdl(int n, int l, double *a);
+    int j, j1, j2, j3, j4, j5, j6, j7, l;
+    double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    l = 2;
+    if (n > 16) {
+        cft1st(n, a);
+        l = 16;
+        while ((l << 3) < n) {
+            cftmdl(n, l, a);
+            l <<= 3;
+        }
+    }
+    if ((l << 2) < n) {
+        wn4r = WR5000;
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            j4 = j3 + l;
+            j5 = j4 + l;
+            j6 = j5 + l;
+            j7 = j6 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            y0r = x0r + x2r;
+            y0i = x0i - x2i;
+            y2r = x0r - x2r;
+            y2i = x0i + x2i;
+            y1r = x1r - x3i;
+            y1i = x1i - x3r;
+            y3r = x1r + x3i;
+            y3i = x1i + x3r;
+            x0r = a[j4] + a[j5];
+            x0i = a[j4 + 1] + a[j5 + 1];
+            x1r = a[j4] - a[j5];
+            x1i = a[j4 + 1] - a[j5 + 1];
+            x2r = a[j6] + a[j7];
+            x2i = a[j6 + 1] + a[j7 + 1];
+            x3r = a[j6] - a[j7];
+            x3i = a[j6 + 1] - a[j7 + 1];
+            y4r = x0r + x2r;
+            y4i = x0i + x2i;
+            y6r = x0r - x2r;
+            y6i = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            x2r = x1r + x3i;
+            x2i = x1i - x3r;
+            y5r = wn4r * (x0r - x0i);
+            y5i = wn4r * (x0r + x0i);
+            y7r = wn4r * (x2r - x2i);
+            y7i = wn4r * (x2r + x2i);
+            a[j1] = y1r + y5r;
+            a[j1 + 1] = y1i - y5i;
+            a[j5] = y1r - y5r;
+            a[j5 + 1] = y1i + y5i;
+            a[j3] = y3r - y7i;
+            a[j3 + 1] = y3i - y7r;
+            a[j7] = y3r + y7i;
+            a[j7 + 1] = y3i + y7r;
+            a[j] = y0r + y4r;
+            a[j + 1] = y0i - y4i;
+            a[j4] = y0r - y4r;
+            a[j4 + 1] = y0i + y4i;
+            a[j2] = y2r - y6i;
+            a[j2 + 1] = y2i - y6r;
+            a[j6] = y2r + y6i;
+            a[j6 + 1] = y2i + y6r;
+        }
+    } else if ((l << 2) == n) {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i - x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i + x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i - x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i + x3r;
+        }
+    } else {
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = -a[j + 1] + a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] = -a[j + 1] - a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+void cft1st(int n, double *a)
+{
+    int j, kj, kr;
+    double ew, wn4r, wtmp, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
+        wk4r, wk4i, wk5r, wk5i, wk6r, wk6i, wk7r, wk7i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = WR5000;
+    x0r = a[0] + a[2];
+    x0i = a[1] + a[3];
+    x1r = a[0] - a[2];
+    x1i = a[1] - a[3];
+    x2r = a[4] + a[6];
+    x2i = a[5] + a[7];
+    x3r = a[4] - a[6];
+    x3i = a[5] - a[7];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y2r = x0r - x2r;
+    y2i = x0i - x2i;
+    y1r = x1r - x3i;
+    y1i = x1i + x3r;
+    y3r = x1r + x3i;
+    y3i = x1i - x3r;
+    x0r = a[8] + a[10];
+    x0i = a[9] + a[11];
+    x1r = a[8] - a[10];
+    x1i = a[9] - a[11];
+    x2r = a[12] + a[14];
+    x2i = a[13] + a[15];
+    x3r = a[12] - a[14];
+    x3i = a[13] - a[15];
+    y4r = x0r + x2r;
+    y4i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    x2r = x1r + x3i;
+    x2i = x1i - x3r;
+    y5r = wn4r * (x0r - x0i);
+    y5i = wn4r * (x0r + x0i);
+    y7r = wn4r * (x2r - x2i);
+    y7i = wn4r * (x2r + x2i);
+    a[2] = y1r + y5r;
+    a[3] = y1i + y5i;
+    a[10] = y1r - y5r;
+    a[11] = y1i - y5i;
+    a[6] = y3r - y7i;
+    a[7] = y3i + y7r;
+    a[14] = y3r + y7i;
+    a[15] = y3i - y7r;
+    a[0] = y0r + y4r;
+    a[1] = y0i + y4i;
+    a[8] = y0r - y4r;
+    a[9] = y0i - y4i;
+    a[4] = y2r - y6i;
+    a[5] = y2i + y6r;
+    a[12] = y2r + y6i;
+    a[13] = y2i - y6r;
+    if (n > 16) {
+        wk1r = WR2500;
+        wk1i = WI2500;
+        x0r = a[16] + a[18];
+        x0i = a[17] + a[19];
+        x1r = a[16] - a[18];
+        x1i = a[17] - a[19];
+        x2r = a[20] + a[22];
+        x2i = a[21] + a[23];
+        x3r = a[20] - a[22];
+        x3i = a[21] - a[23];
+        y0r = x0r + x2r;
+        y0i = x0i + x2i;
+        y2r = x0r - x2r;
+        y2i = x0i - x2i;
+        y1r = x1r - x3i;
+        y1i = x1i + x3r;
+        y3r = x1r + x3i;
+        y3i = x1i - x3r;
+        x0r = a[24] + a[26];
+        x0i = a[25] + a[27];
+        x1r = a[24] - a[26];
+        x1i = a[25] - a[27];
+        x2r = a[28] + a[30];
+        x2i = a[29] + a[31];
+        x3r = a[28] - a[30];
+        x3i = a[29] - a[31];
+        y4r = x0r + x2r;
+        y4i = x0i + x2i;
+        y6r = x0r - x2r;
+        y6i = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        x2r = x1r + x3i;
+        x2i = x3r - x1i;
+        y5r = wk1i * x0r - wk1r * x0i;
+        y5i = wk1i * x0i + wk1r * x0r;
+        y7r = wk1r * x2r + wk1i * x2i;
+        y7i = wk1r * x2i - wk1i * x2r;
+        x0r = wk1r * y1r - wk1i * y1i;
+        x0i = wk1r * y1i + wk1i * y1r;
+        a[18] = x0r + y5r;
+        a[19] = x0i + y5i;
+        a[26] = y5i - x0i;
+        a[27] = x0r - y5r;
+        x0r = wk1i * y3r - wk1r * y3i;
+        x0i = wk1i * y3i + wk1r * y3r;
+        a[22] = x0r - y7r;
+        a[23] = x0i + y7i;
+        a[30] = y7i - x0i;
+        a[31] = x0r + y7r;
+        a[16] = y0r + y4r;
+        a[17] = y0i + y4i;
+        a[24] = y4i - y0i;
+        a[25] = y0r - y4r;
+        x0r = y2r - y6i;
+        x0i = y2i + y6r;
+        a[20] = wn4r * (x0r - x0i);
+        a[21] = wn4r * (x0i + x0r);
+        x0r = y6r - y2i;
+        x0i = y2r + y6i;
+        a[28] = wn4r * (x0r - x0i);
+        a[29] = wn4r * (x0i + x0r);
+        ew = M_PI_2 / n;
+        kr = n >> 2;
+        for (j = 32; j < n; j += 16) {
+            for (kj = n >> 2; kj > (kr ^= kj); kj >>= 1);
+            wk1r = cos(ew * kr);
+            wk1i = sin(ew * kr);
+            wk2r = 1 - 2 * wk1i * wk1i;
+            wk2i = 2 * wk1i * wk1r;
+            wtmp = 2 * wk2i;
+            wk3r = wk1r - wtmp * wk1i;
+            wk3i = wtmp * wk1r - wk1i;
+            wk4r = 1 - wtmp * wk2i;
+            wk4i = wtmp * wk2r;
+            wtmp = 2 * wk4i;
+            wk5r = wk3r - wtmp * wk1i;
+            wk5i = wtmp * wk1r - wk3i;
+            wk6r = wk2r - wtmp * wk2i;
+            wk6i = wtmp * wk2r - wk2i;
+            wk7r = wk1r - wtmp * wk3i;
+            wk7i = wtmp * wk3r - wk1i;
+            x0r = a[j] + a[j + 2];
+            x0i = a[j + 1] + a[j + 3];
+            x1r = a[j] - a[j + 2];
+            x1i = a[j + 1] - a[j + 3];
+            x2r = a[j + 4] + a[j + 6];
+            x2i = a[j + 5] + a[j + 7];
+            x3r = a[j + 4] - a[j + 6];
+            x3i = a[j + 5] - a[j + 7];
+            y0r = x0r + x2r;
+            y0i = x0i + x2i;
+            y2r = x0r - x2r;
+            y2i = x0i - x2i;
+            y1r = x1r - x3i;
+            y1i = x1i + x3r;
+            y3r = x1r + x3i;
+            y3i = x1i - x3r;
+            x0r = a[j + 8] + a[j + 10];
+            x0i = a[j + 9] + a[j + 11];
+            x1r = a[j + 8] - a[j + 10];
+            x1i = a[j + 9] - a[j + 11];
+            x2r = a[j + 12] + a[j + 14];
+            x2i = a[j + 13] + a[j + 15];
+            x3r = a[j + 12] - a[j + 14];
+            x3i = a[j + 13] - a[j + 15];
+            y4r = x0r + x2r;
+            y4i = x0i + x2i;
+            y6r = x0r - x2r;
+            y6i = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            x2r = x1r + x3i;
+            x2i = x1i - x3r;
+            y5r = wn4r * (x0r - x0i);
+            y5i = wn4r * (x0r + x0i);
+            y7r = wn4r * (x2r - x2i);
+            y7i = wn4r * (x2r + x2i);
+            x0r = y1r + y5r;
+            x0i = y1i + y5i;
+            a[j + 2] = wk1r * x0r - wk1i * x0i;
+            a[j + 3] = wk1r * x0i + wk1i * x0r;
+            x0r = y1r - y5r;
+            x0i = y1i - y5i;
+            a[j + 10] = wk5r * x0r - wk5i * x0i;
+            a[j + 11] = wk5r * x0i + wk5i * x0r;
+            x0r = y3r - y7i;
+            x0i = y3i + y7r;
+            a[j + 6] = wk3r * x0r - wk3i * x0i;
+            a[j + 7] = wk3r * x0i + wk3i * x0r;
+            x0r = y3r + y7i;
+            x0i = y3i - y7r;
+            a[j + 14] = wk7r * x0r - wk7i * x0i;
+            a[j + 15] = wk7r * x0i + wk7i * x0r;
+            a[j] = y0r + y4r;
+            a[j + 1] = y0i + y4i;
+            x0r = y0r - y4r;
+            x0i = y0i - y4i;
+            a[j + 8] = wk4r * x0r - wk4i * x0i;
+            a[j + 9] = wk4r * x0i + wk4i * x0r;
+            x0r = y2r - y6i;
+            x0i = y2i + y6r;
+            a[j + 4] = wk2r * x0r - wk2i * x0i;
+            a[j + 5] = wk2r * x0i + wk2i * x0r;
+            x0r = y2r + y6i;
+            x0i = y2i - y6r;
+            a[j + 12] = wk6r * x0r - wk6i * x0i;
+            a[j + 13] = wk6r * x0i + wk6i * x0r;
+        }
+    }
+}
+
+
+void cftmdl(int n, int l, double *a)
+{
+    int j, j1, j2, j3, j4, j5, j6, j7, k, kj, kr, m;
+    double ew, wn4r, wtmp, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
+        wk4r, wk4i, wk5r, wk5i, wk6r, wk6i, wk7r, wk7i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    m = l << 3;
+    wn4r = WR5000;
+    for (j = 0; j < l; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        j4 = j3 + l;
+        j5 = j4 + l;
+        j6 = j5 + l;
+        j7 = j6 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        y0r = x0r + x2r;
+        y0i = x0i + x2i;
+        y2r = x0r - x2r;
+        y2i = x0i - x2i;
+        y1r = x1r - x3i;
+        y1i = x1i + x3r;
+        y3r = x1r + x3i;
+        y3i = x1i - x3r;
+        x0r = a[j4] + a[j5];
+        x0i = a[j4 + 1] + a[j5 + 1];
+        x1r = a[j4] - a[j5];
+        x1i = a[j4 + 1] - a[j5 + 1];
+        x2r = a[j6] + a[j7];
+        x2i = a[j6 + 1] + a[j7 + 1];
+        x3r = a[j6] - a[j7];
+        x3i = a[j6 + 1] - a[j7 + 1];
+        y4r = x0r + x2r;
+        y4i = x0i + x2i;
+        y6r = x0r - x2r;
+        y6i = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        x2r = x1r + x3i;
+        x2i = x1i - x3r;
+        y5r = wn4r * (x0r - x0i);
+        y5i = wn4r * (x0r + x0i);
+        y7r = wn4r * (x2r - x2i);
+        y7i = wn4r * (x2r + x2i);
+        a[j1] = y1r + y5r;
+        a[j1 + 1] = y1i + y5i;
+        a[j5] = y1r - y5r;
+        a[j5 + 1] = y1i - y5i;
+        a[j3] = y3r - y7i;
+        a[j3 + 1] = y3i + y7r;
+        a[j7] = y3r + y7i;
+        a[j7 + 1] = y3i - y7r;
+        a[j] = y0r + y4r;
+        a[j + 1] = y0i + y4i;
+        a[j4] = y0r - y4r;
+        a[j4 + 1] = y0i - y4i;
+        a[j2] = y2r - y6i;
+        a[j2 + 1] = y2i + y6r;
+        a[j6] = y2r + y6i;
+        a[j6 + 1] = y2i - y6r;
+    }
+    if (m < n) {
+        wk1r = WR2500;
+        wk1i = WI2500;
+        for (j = m; j < l + m; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            j4 = j3 + l;
+            j5 = j4 + l;
+            j6 = j5 + l;
+            j7 = j6 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            y0r = x0r + x2r;
+            y0i = x0i + x2i;
+            y2r = x0r - x2r;
+            y2i = x0i - x2i;
+            y1r = x1r - x3i;
+            y1i = x1i + x3r;
+            y3r = x1r + x3i;
+            y3i = x1i - x3r;
+            x0r = a[j4] + a[j5];
+            x0i = a[j4 + 1] + a[j5 + 1];
+            x1r = a[j4] - a[j5];
+            x1i = a[j4 + 1] - a[j5 + 1];
+            x2r = a[j6] + a[j7];
+            x2i = a[j6 + 1] + a[j7 + 1];
+            x3r = a[j6] - a[j7];
+            x3i = a[j6 + 1] - a[j7 + 1];
+            y4r = x0r + x2r;
+            y4i = x0i + x2i;
+            y6r = x0r - x2r;
+            y6i = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            x2r = x1r + x3i;
+            x2i = x3r - x1i;
+            y5r = wk1i * x0r - wk1r * x0i;
+            y5i = wk1i * x0i + wk1r * x0r;
+            y7r = wk1r * x2r + wk1i * x2i;
+            y7i = wk1r * x2i - wk1i * x2r;
+            x0r = wk1r * y1r - wk1i * y1i;
+            x0i = wk1r * y1i + wk1i * y1r;
+            a[j1] = x0r + y5r;
+            a[j1 + 1] = x0i + y5i;
+            a[j5] = y5i - x0i;
+            a[j5 + 1] = x0r - y5r;
+            x0r = wk1i * y3r - wk1r * y3i;
+            x0i = wk1i * y3i + wk1r * y3r;
+            a[j3] = x0r - y7r;
+            a[j3 + 1] = x0i + y7i;
+            a[j7] = y7i - x0i;
+            a[j7 + 1] = x0r + y7r;
+            a[j] = y0r + y4r;
+            a[j + 1] = y0i + y4i;
+            a[j4] = y4i - y0i;
+            a[j4 + 1] = y0r - y4r;
+            x0r = y2r - y6i;
+            x0i = y2i + y6r;
+            a[j2] = wn4r * (x0r - x0i);
+            a[j2 + 1] = wn4r * (x0i + x0r);
+            x0r = y6r - y2i;
+            x0i = y2r + y6i;
+            a[j6] = wn4r * (x0r - x0i);
+            a[j6 + 1] = wn4r * (x0i + x0r);
+        }
+        ew = M_PI_2 / n;
+        kr = n >> 2;
+        for (k = 2 * m; k < n; k += m) {
+            for (kj = n >> 2; kj > (kr ^= kj); kj >>= 1);
+            wk1r = cos(ew * kr);
+            wk1i = sin(ew * kr);
+            wk2r = 1 - 2 * wk1i * wk1i;
+            wk2i = 2 * wk1i * wk1r;
+            wtmp = 2 * wk2i;
+            wk3r = wk1r - wtmp * wk1i;
+            wk3i = wtmp * wk1r - wk1i;
+            wk4r = 1 - wtmp * wk2i;
+            wk4i = wtmp * wk2r;
+            wtmp = 2 * wk4i;
+            wk5r = wk3r - wtmp * wk1i;
+            wk5i = wtmp * wk1r - wk3i;
+            wk6r = wk2r - wtmp * wk2i;
+            wk6i = wtmp * wk2r - wk2i;
+            wk7r = wk1r - wtmp * wk3i;
+            wk7i = wtmp * wk3r - wk1i;
+            for (j = k; j < l + k; j += 2) {
+                j1 = j + l;
+                j2 = j1 + l;
+                j3 = j2 + l;
+                j4 = j3 + l;
+                j5 = j4 + l;
+                j6 = j5 + l;
+                j7 = j6 + l;
+                x0r = a[j] + a[j1];
+                x0i = a[j + 1] + a[j1 + 1];
+                x1r = a[j] - a[j1];
+                x1i = a[j + 1] - a[j1 + 1];
+                x2r = a[j2] + a[j3];
+                x2i = a[j2 + 1] + a[j3 + 1];
+                x3r = a[j2] - a[j3];
+                x3i = a[j2 + 1] - a[j3 + 1];
+                y0r = x0r + x2r;
+                y0i = x0i + x2i;
+                y2r = x0r - x2r;
+                y2i = x0i - x2i;
+                y1r = x1r - x3i;
+                y1i = x1i + x3r;
+                y3r = x1r + x3i;
+                y3i = x1i - x3r;
+                x0r = a[j4] + a[j5];
+                x0i = a[j4 + 1] + a[j5 + 1];
+                x1r = a[j4] - a[j5];
+                x1i = a[j4 + 1] - a[j5 + 1];
+                x2r = a[j6] + a[j7];
+                x2i = a[j6 + 1] + a[j7 + 1];
+                x3r = a[j6] - a[j7];
+                x3i = a[j6 + 1] - a[j7 + 1];
+                y4r = x0r + x2r;
+                y4i = x0i + x2i;
+                y6r = x0r - x2r;
+                y6i = x0i - x2i;
+                x0r = x1r - x3i;
+                x0i = x1i + x3r;
+                x2r = x1r + x3i;
+                x2i = x1i - x3r;
+                y5r = wn4r * (x0r - x0i);
+                y5i = wn4r * (x0r + x0i);
+                y7r = wn4r * (x2r - x2i);
+                y7i = wn4r * (x2r + x2i);
+                x0r = y1r + y5r;
+                x0i = y1i + y5i;
+                a[j1] = wk1r * x0r - wk1i * x0i;
+                a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+                x0r = y1r - y5r;
+                x0i = y1i - y5i;
+                a[j5] = wk5r * x0r - wk5i * x0i;
+                a[j5 + 1] = wk5r * x0i + wk5i * x0r;
+                x0r = y3r - y7i;
+                x0i = y3i + y7r;
+                a[j3] = wk3r * x0r - wk3i * x0i;
+                a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+                x0r = y3r + y7i;
+                x0i = y3i - y7r;
+                a[j7] = wk7r * x0r - wk7i * x0i;
+                a[j7 + 1] = wk7r * x0i + wk7i * x0r;
+                a[j] = y0r + y4r;
+                a[j + 1] = y0i + y4i;
+                x0r = y0r - y4r;
+                x0i = y0i - y4i;
+                a[j4] = wk4r * x0r - wk4i * x0i;
+                a[j4 + 1] = wk4r * x0i + wk4i * x0r;
+                x0r = y2r - y6i;
+                x0i = y2i + y6r;
+                a[j2] = wk2r * x0r - wk2i * x0i;
+                a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+                x0r = y2r + y6i;
+                x0i = y2i - y6r;
+                a[j6] = wk6r * x0r - wk6i * x0i;
+                a[j6 + 1] = wk6r * x0i + wk6i * x0r;
+            }
+        }
+    }
+}
+
+
+void rftfsub(int n, double *a)
+{
+    int i, i0, j, k;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = 2 * M_PI_2 / n;
+    wkr = 0;
+    wki = 0;
+    wdi = cos(ec);
+    wdr = sin(ec);
+    wdi *= wdr;
+    wdr *= wdr;
+    w1r = 1 - 2 * wdr;
+    w1i = 2 * wdi;
+    ss = 2 * w1i;
+    i = n >> 1;
+    for (;;) {
+        i0 = i - 4 * RDFT_LOOP_DIV;
+        if (i0 < 4) {
+            i0 = 4;
+        }
+        for (j = i - 4; j >= i0; j -= 4) {
+            k = n - j;
+            xr = a[j + 2] - a[k - 2];
+            xi = a[j + 3] + a[k - 1];
+            yr = wdr * xr - wdi * xi;
+            yi = wdr * xi + wdi * xr;
+            a[j + 2] -= yr;
+            a[j + 3] -= yi;
+            a[k - 2] += yr;
+            a[k - 1] -= yi;
+            wkr += ss * wdi;
+            wki += ss * (0.5 - wdr);
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr - wki * xi;
+            yi = wkr * xi + wki * xr;
+            a[j] -= yr;
+            a[j + 1] -= yi;
+            a[k] += yr;
+            a[k + 1] -= yi;
+            wdr += ss * wki;
+            wdi += ss * (0.5 - wkr);
+        }
+        if (i0 == 4) {
+            break;
+        }
+        wkr = 0.5 * sin(ec * i0);
+        wki = 0.5 * cos(ec * i0);
+        wdr = 0.5 - (wkr * w1r - wki * w1i);
+        wdi = wkr * w1i + wki * w1r;
+        wkr = 0.5 - wkr;
+        i = i0;
+    }
+    xr = a[2] - a[n - 2];
+    xi = a[3] + a[n - 1];
+    yr = wdr * xr - wdi * xi;
+    yi = wdr * xi + wdi * xr;
+    a[2] -= yr;
+    a[3] -= yi;
+    a[n - 2] += yr;
+    a[n - 1] -= yi;
+}
+
+
+void rftbsub(int n, double *a)
+{
+    int i, i0, j, k;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = 2 * M_PI_2 / n;
+    wkr = 0;
+    wki = 0;
+    wdi = cos(ec);
+    wdr = sin(ec);
+    wdi *= wdr;
+    wdr *= wdr;
+    w1r = 1 - 2 * wdr;
+    w1i = 2 * wdi;
+    ss = 2 * w1i;
+    i = n >> 1;
+    a[i + 1] = -a[i + 1];
+    for (;;) {
+        i0 = i - 4 * RDFT_LOOP_DIV;
+        if (i0 < 4) {
+            i0 = 4;
+        }
+        for (j = i - 4; j >= i0; j -= 4) {
+            k = n - j;
+            xr = a[j + 2] - a[k - 2];
+            xi = a[j + 3] + a[k - 1];
+            yr = wdr * xr + wdi * xi;
+            yi = wdr * xi - wdi * xr;
+            a[j + 2] -= yr;
+            a[j + 3] = yi - a[j + 3];
+            a[k - 2] += yr;
+            a[k - 1] = yi - a[k - 1];
+            wkr += ss * wdi;
+            wki += ss * (0.5 - wdr);
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr + wki * xi;
+            yi = wkr * xi - wki * xr;
+            a[j] -= yr;
+            a[j + 1] = yi - a[j + 1];
+            a[k] += yr;
+            a[k + 1] = yi - a[k + 1];
+            wdr += ss * wki;
+            wdi += ss * (0.5 - wkr);
+        }
+        if (i0 == 4) {
+            break;
+        }
+        wkr = 0.5 * sin(ec * i0);
+        wki = 0.5 * cos(ec * i0);
+        wdr = 0.5 - (wkr * w1r - wki * w1i);
+        wdi = wkr * w1i + wki * w1r;
+        wkr = 0.5 - wkr;
+        i = i0;
+    }
+    xr = a[2] - a[n - 2];
+    xi = a[3] + a[n - 1];
+    yr = wdr * xr + wdi * xi;
+    yi = wdr * xi - wdi * xr;
+    a[2] -= yr;
+    a[3] = yi - a[3];
+    a[n - 2] += yr;
+    a[n - 1] = yi - a[n - 1];
+    a[1] = -a[1];
+}
+
+
+void dctsub(int n, double *a)
+{
+    int i, i0, j, k, m;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = M_PI_2 / n;
+    wkr = 0.5;
+    wki = 0.5;
+    w1r = cos(ec);
+    w1i = sin(ec);
+    wdr = 0.5 * (w1r - w1i);
+    wdi = 0.5 * (w1r + w1i);
+    ss = 2 * w1i;
+    m = n >> 1;
+    i = 0;
+    for (;;) {
+        i0 = i + 2 * DCST_LOOP_DIV;
+        if (i0 > m - 2) {
+            i0 = m - 2;
+        }
+        for (j = i + 2; j <= i0; j += 2) {
+            k = n - j;
+            xr = wdi * a[j - 1] - wdr * a[k + 1];
+            xi = wdr * a[j - 1] + wdi * a[k + 1];
+            wkr -= ss * wdi;
+            wki += ss * wdr;
+            yr = wki * a[j] - wkr * a[k];
+            yi = wkr * a[j] + wki * a[k];
+            wdr -= ss * wki;
+            wdi += ss * wkr;
+            a[k + 1] = xr;
+            a[k] = yr;
+            a[j - 1] = xi;
+            a[j] = yi;
+        }
+        if (i0 == m - 2) {
+            break;
+        }
+        wdr = cos(ec * i0);
+        wdi = sin(ec * i0);
+        wkr = 0.5 * (wdr - wdi);
+        wki = 0.5 * (wdr + wdi);
+        wdr = wkr * w1r - wki * w1i;
+        wdi = wkr * w1i + wki * w1r;
+        i = i0;
+    }
+    xr = wdi * a[m - 1] - wdr * a[m + 1];
+    a[m - 1] = wdr * a[m - 1] + wdi * a[m + 1];
+    a[m + 1] = xr;
+    a[m] *= wki + ss * wdr;
+}
+
+
+void dstsub(int n, double *a)
+{
+    int i, i0, j, k, m;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = M_PI_2 / n;
+    wkr = 0.5;
+    wki = 0.5;
+    w1r = cos(ec);
+    w1i = sin(ec);
+    wdr = 0.5 * (w1r - w1i);
+    wdi = 0.5 * (w1r + w1i);
+    ss = 2 * w1i;
+    m = n >> 1;
+    i = 0;
+    for (;;) {
+        i0 = i + 2 * DCST_LOOP_DIV;
+        if (i0 > m - 2) {
+            i0 = m - 2;
+        }
+        for (j = i + 2; j <= i0; j += 2) {
+            k = n - j;
+            xr = wdi * a[k + 1] - wdr * a[j - 1];
+            xi = wdr * a[k + 1] + wdi * a[j - 1];
+            wkr -= ss * wdi;
+            wki += ss * wdr;
+            yr = wki * a[k] - wkr * a[j];
+            yi = wkr * a[k] + wki * a[j];
+            wdr -= ss * wki;
+            wdi += ss * wkr;
+            a[j - 1] = xr;
+            a[j] = yr;
+            a[k + 1] = xi;
+            a[k] = yi;
+        }
+        if (i0 == m - 2) {
+            break;
+        }
+        wdr = cos(ec * i0);
+        wdi = sin(ec * i0);
+        wkr = 0.5 * (wdr - wdi);
+        wki = 0.5 * (wdr + wdi);
+        wdr = wkr * w1r - wki * w1i;
+        wdi = wkr * w1i + wki * w1r;
+        i = i0;
+    }
+    xr = wdi * a[m + 1] - wdr * a[m - 1];
+    a[m + 1] = wdr * a[m + 1] + wdi * a[m - 1];
+    a[m - 1] = xr;
+    a[m] *= wki + ss * wdr;
+}
+
+
+void dctsub4(int n, double *a)
+{
+    int m;
+    double wki, wdr, wdi, xr;
+    
+    wki = WR5000;
+    m = n >> 1;
+    if (m == 2) {
+        wdr = wki * WI2500;
+        wdi = wki * WR2500;
+        xr = wdi * a[1] - wdr * a[3];
+        a[1] = wdr * a[1] + wdi * a[3];
+        a[3] = xr;
+    }
+    a[m] *= wki;
+}
+
+
+void dstsub4(int n, double *a)
+{
+    int m;
+    double wki, wdr, wdi, xr;
+    
+    wki = WR5000;
+    m = n >> 1;
+    if (m == 2) {
+        wdr = wki * WI2500;
+        wdi = wki * WR2500;
+        xr = wdi * a[3] - wdr * a[1];
+        a[3] = wdr * a[3] + wdi * a[1];
+        a[1] = xr;
+    }
+    a[m] *= wki;
+}
+
diff --git a/fftsg.c b/fftsg.c
new file mode 100644
index 0000000..43d7534
--- /dev/null
+++ b/fftsg.c
@@ -0,0 +1,3314 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+macro definitions
+    USE_CDFT_PTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
+    USE_CDFT_WINTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]), 
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]), 
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            cdft(2*n, -1, a, ip, w);
+        is 
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            rdft(n, 1, a, ip, w);
+        is 
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddct(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddst(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            dfst(n, a, t, ip, w);
+        is 
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when the larger table required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+void cdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    int nw;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    if (isgn >= 0) {
+        cftfsub(n, a, ip, nw, w);
+    } else {
+        cftbsub(n, a, ip, nw, w);
+    }
+}
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    int nw, nc;
+    double xi;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 2)) {
+        nc = n >> 2;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a, ip, nw, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, ip, nw, w);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            cftbsub(n, a, ip, nw, w);
+        } else if (n == 4) {
+            cftbsub(n, a, ip, nw, w);
+        }
+    }
+}
+
+
+void ddct(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            cftbsub(n, a, ip, nw, w);
+        } else if (n == 4) {
+            cftbsub(n, a, ip, nw, w);
+        }
+    }
+    dctsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a, ip, nw, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, ip, nw, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            cftbsub(n, a, ip, nw, w);
+        } else if (n == 4) {
+            cftbsub(n, a, ip, nw, w);
+        }
+    }
+    dstsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a, ip, nw, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, ip, nw, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a, double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    m = n >> 1;
+    yi = a[m];
+    xi = a[0] + a[n];
+    a[0] -= a[n];
+    t[0] = xi - yi;
+    t[m] = xi + yi;
+    if (n > 2) {
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[j] - a[n - j];
+            xi = a[j] + a[n - j];
+            yr = a[k] - a[n - k];
+            yi = a[k] + a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi - yi;
+            t[k] = xi + yi;
+        }
+        t[mh] = a[mh] + a[n - mh];
+        a[mh] -= a[n - mh];
+        dctsub(m, a, nc, w + nw);
+        if (m > 4) {
+            cftfsub(m, a, ip, nw, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, ip, nw, w);
+        }
+        a[n - 1] = a[0] - a[1];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] + a[j + 1];
+            a[2 * j - 1] = a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {
+            dctsub(m, t, nc, w + nw);
+            if (m > 4) {
+                cftfsub(m, t, ip, nw, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, ip, nw, w);
+            }
+            a[n - l] = t[0] - t[1];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = t[j] - t[j + 1];
+                a[k + l] = t[j] + t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 0; j < mh; j++) {
+                k = m - j;
+                t[j] = t[m + k] - t[m + j];
+                t[k] = t[m + k] + t[m + j];
+            }
+            t[mh] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+        a[n] = t[2] - t[1];
+        a[0] = t[2] + t[1];
+    } else {
+        a[1] = a[0];
+        a[2] = t[0];
+        a[0] = t[1];
+    }
+}
+
+
+void dfst(int n, double *a, double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    if (n > 2) {
+        m = n >> 1;
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[j] + a[n - j];
+            xi = a[j] - a[n - j];
+            yr = a[k] + a[n - k];
+            yi = a[k] - a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi + yi;
+            t[k] = xi - yi;
+        }
+        t[0] = a[mh] - a[n - mh];
+        a[mh] += a[n - mh];
+        a[0] = a[m];
+        dstsub(m, a, nc, w + nw);
+        if (m > 4) {
+            cftfsub(m, a, ip, nw, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, ip, nw, w);
+        }
+        a[n - 1] = a[1] - a[0];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] - a[j + 1];
+            a[2 * j - 1] = -a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {
+            dstsub(m, t, nc, w + nw);
+            if (m > 4) {
+                cftfsub(m, t, ip, nw, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, ip, nw, w);
+            }
+            a[n - l] = t[1] - t[0];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = -t[j] - t[j + 1];
+                a[k + l] = t[j] - t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 1; j < mh; j++) {
+                k = m - j;
+                t[j] = t[m + k] + t[m + j];
+                t[k] = t[m + k] - t[m + j];
+            }
+            t[0] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+    }
+    a[0] = 0;
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w)
+{
+    void makeipt(int nw, int *ip);
+    int j, nwh, nw0, nw1;
+    double delta, wn4r, wk1r, wk1i, wk3r, wk3i;
+    
+    ip[0] = nw;
+    ip[1] = 1;
+    if (nw > 2) {
+        nwh = nw >> 1;
+        delta = atan(1.0) / nwh;
+        wn4r = cos(delta * nwh);
+        w[0] = 1;
+        w[1] = wn4r;
+        if (nwh == 4) {
+            w[2] = cos(delta * 2);
+            w[3] = sin(delta * 2);
+        } else if (nwh > 4) {
+            makeipt(nw, ip);
+            w[2] = 0.5 / cos(delta * 2);
+            w[3] = 0.5 / cos(delta * 6);
+            for (j = 4; j < nwh; j += 4) {
+                w[j] = cos(delta * j);
+                w[j + 1] = sin(delta * j);
+                w[j + 2] = cos(3 * delta * j);
+                w[j + 3] = -sin(3 * delta * j);
+            }
+        }
+        nw0 = 0;
+        while (nwh > 2) {
+            nw1 = nw0 + nwh;
+            nwh >>= 1;
+            w[nw1] = 1;
+            w[nw1 + 1] = wn4r;
+            if (nwh == 4) {
+                wk1r = w[nw0 + 4];
+                wk1i = w[nw0 + 5];
+                w[nw1 + 2] = wk1r;
+                w[nw1 + 3] = wk1i;
+            } else if (nwh > 4) {
+                wk1r = w[nw0 + 4];
+                wk3r = w[nw0 + 6];
+                w[nw1 + 2] = 0.5 / wk1r;
+                w[nw1 + 3] = 0.5 / wk3r;
+                for (j = 4; j < nwh; j += 4) {
+                    wk1r = w[nw0 + 2 * j];
+                    wk1i = w[nw0 + 2 * j + 1];
+                    wk3r = w[nw0 + 2 * j + 2];
+                    wk3i = w[nw0 + 2 * j + 3];
+                    w[nw1 + j] = wk1r;
+                    w[nw1 + j + 1] = wk1i;
+                    w[nw1 + j + 2] = wk3r;
+                    w[nw1 + j + 3] = wk3i;
+                }
+            }
+            nw0 = nw1;
+        }
+    }
+}
+
+
+void makeipt(int nw, int *ip)
+{
+    int j, l, m, m2, p, q;
+    
+    ip[2] = 0;
+    ip[3] = 16;
+    m = 2;
+    for (l = nw; l > 32; l >>= 2) {
+        m2 = m << 1;
+        q = m2 << 3;
+        for (j = m; j < m2; j++) {
+            p = ip[j] << 2;
+            ip[m + j] = p;
+            ip[m2 + j] = p + q;
+        }
+        m = m2;
+    }
+}
+
+
+void makect(int nc, int *ip, double *c)
+{
+    int j, nch;
+    double delta;
+    
+    ip[1] = nc;
+    if (nc > 1) {
+        nch = nc >> 1;
+        delta = atan(1.0) / nch;
+        c[0] = cos(delta * nch);
+        c[nch] = 0.5 * c[0];
+        for (j = 1; j < nch; j++) {
+            c[j] = 0.5 * cos(delta * j);
+            c[nc - j] = 0.5 * sin(delta * j);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+#ifdef USE_CDFT_PTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 8192
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t pthread_t
+#define cdft_thread_create(thp,func,argp) { \
+    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    if (pthread_join(th, NULL) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#endif /* USE_CDFT_PTHREADS */
+
+
+#ifdef USE_CDFT_WINTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 32768
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 524288
+#endif
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t HANDLE
+#define cdft_thread_create(thp,func,argp) { \
+    DWORD thid; \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
+    if (*(thp) == 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th); \
+}
+#endif /* USE_CDFT_WINTHREADS */
+
+
+void cftfsub(int n, double *a, int *ip, int nw, double *w)
+{
+    void bitrv2(int n, int *ip, double *a);
+    void bitrv216(double *a);
+    void bitrv208(double *a);
+    void cftf1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+    
+    if (n > 8) {
+        if (n > 32) {
+            cftf1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else 
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2(n, ip, a);
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]);
+            bitrv216(a);
+        } else {
+            cftf081(a, w);
+            bitrv208(a);
+        }
+    } else if (n == 8) {
+        cftf040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+
+void cftbsub(int n, double *a, int *ip, int nw, double *w)
+{
+    void bitrv2conj(int n, int *ip, double *a);
+    void bitrv216neg(double *a);
+    void bitrv208neg(double *a);
+    void cftb1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftb040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+    
+    if (n > 8) {
+        if (n > 32) {
+            cftb1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else 
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2conj(n, ip, a);
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]);
+            bitrv216neg(a);
+        } else {
+            cftf081(a, w);
+            bitrv208neg(a);
+        }
+    } else if (n == 8) {
+        cftb040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+
+void bitrv2(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+    
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + 2 * ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 += 2 * nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 -= nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - nm;
+            k1 += 2 * nm - 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    } else {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + ip[m + k];
+                k1 = 4 * k + ip[m + j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 += nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    }
+}
+
+
+void bitrv2conj(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+    
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + 2 * ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            j1 += nm;
+            k1 += 2 * nm;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 -= nm;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - nm;
+            k1 += 2 * nm - 2;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+        }
+    } else {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + ip[m + k];
+                k1 = 4 * k + ip[m + j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            j1 += nm;
+            k1 += nm;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+        }
+    }
+}
+
+
+void bitrv216(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i, 
+        x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    a[2] = x8r;
+    a[3] = x8i;
+    a[4] = x4r;
+    a[5] = x4i;
+    a[6] = x12r;
+    a[7] = x12i;
+    a[8] = x2r;
+    a[9] = x2i;
+    a[10] = x10r;
+    a[11] = x10i;
+    a[14] = x14r;
+    a[15] = x14i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[20] = x5r;
+    a[21] = x5i;
+    a[22] = x13r;
+    a[23] = x13i;
+    a[24] = x3r;
+    a[25] = x3i;
+    a[26] = x11r;
+    a[27] = x11i;
+    a[28] = x7r;
+    a[29] = x7i;
+}
+
+
+void bitrv216neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i, 
+        x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, 
+        x13r, x13i, x14r, x14i, x15r, x15i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x9r = a[18];
+    x9i = a[19];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    x15r = a[30];
+    x15i = a[31];
+    a[2] = x15r;
+    a[3] = x15i;
+    a[4] = x7r;
+    a[5] = x7i;
+    a[6] = x11r;
+    a[7] = x11i;
+    a[8] = x3r;
+    a[9] = x3i;
+    a[10] = x13r;
+    a[11] = x13i;
+    a[12] = x5r;
+    a[13] = x5i;
+    a[14] = x9r;
+    a[15] = x9i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[18] = x14r;
+    a[19] = x14i;
+    a[20] = x6r;
+    a[21] = x6i;
+    a[22] = x10r;
+    a[23] = x10i;
+    a[24] = x2r;
+    a[25] = x2i;
+    a[26] = x12r;
+    a[27] = x12i;
+    a[28] = x4r;
+    a[29] = x4i;
+    a[30] = x8r;
+    a[31] = x8i;
+}
+
+
+void bitrv208(double *a)
+{
+    double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x6r = a[12];
+    x6i = a[13];
+    a[2] = x4r;
+    a[3] = x4i;
+    a[6] = x6r;
+    a[7] = x6i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[12] = x3r;
+    a[13] = x3i;
+}
+
+
+void bitrv208neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x6r, x6i, x7r, x7i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    a[2] = x7r;
+    a[3] = x7i;
+    a[4] = x3r;
+    a[5] = x3i;
+    a[6] = x5r;
+    a[7] = x5i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[10] = x6r;
+    a[11] = x6i;
+    a[12] = x2r;
+    a[13] = x2i;
+    a[14] = x4r;
+    a[15] = x4i;
+}
+
+
+void cftf1st(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, 
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+    
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) {
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]);
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k];
+        wd1i = w[k + 1];
+        wd3r = w[k + 2];
+        wd3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = a[j + 1] + a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = a[j + 1] - a[j2 + 1];
+        y0r = a[j + 2] + a[j2 + 2];
+        y0i = a[j + 3] + a[j2 + 3];
+        y1r = a[j + 2] - a[j2 + 2];
+        y1i = a[j + 3] - a[j2 + 3];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 + 2] + a[j3 + 2];
+        y2i = a[j1 + 3] + a[j3 + 3];
+        y3r = a[j1 + 2] - a[j3 + 2];
+        y3i = a[j1 + 3] - a[j3 + 3];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j + 2] = y0r + y2r;
+        a[j + 3] = y0i + y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        a[j1 + 2] = y0r - y2r;
+        a[j1 + 3] = y0i - y2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i + y3r;
+        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i - y3r;
+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = a[j0 + 1] + a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = a[j0 + 1] - a[j2 + 1];
+        y0r = a[j0 - 2] + a[j2 - 2];
+        y0i = a[j0 - 1] + a[j2 - 1];
+        y1r = a[j0 - 2] - a[j2 - 2];
+        y1i = a[j0 - 1] - a[j2 - 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 - 2] + a[j3 - 2];
+        y2i = a[j1 - 1] + a[j3 - 1];
+        y3r = a[j1 - 2] - a[j3 - 2];
+        y3i = a[j1 - 1] - a[j3 - 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i + x2i;
+        a[j0 - 2] = y0r + y2r;
+        a[j0 - 1] = y0i + y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        a[j1 - 2] = y0r - y2r;
+        a[j1 - 1] = y0i - y2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i + y3r;
+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i - y3r;
+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+    }
+    wk1r = csc1 * (wd1r + wn4r);
+    wk1i = csc1 * (wd1i + wn4r);
+    wk3r = csc3 * (wd3r - wn4r);
+    wk3i = csc3 * (wd3i - wn4r);
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2];
+    x0i = a[j0 - 1] + a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = a[j0 - 1] - a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i + x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = a[j0 + 3] + a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = a[j0 + 3] - a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i + x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+void cftb1st(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, 
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+    
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = -a[1] - a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = -a[1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    a[j2] = x1r + x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r - x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) {
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]);
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k];
+        wd1i = w[k + 1];
+        wd3r = w[k + 2];
+        wd3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = -a[j + 1] - a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = -a[j + 1] + a[j2 + 1];
+        y0r = a[j + 2] + a[j2 + 2];
+        y0i = -a[j + 3] - a[j2 + 3];
+        y1r = a[j + 2] - a[j2 + 2];
+        y1i = -a[j + 3] + a[j2 + 3];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 + 2] + a[j3 + 2];
+        y2i = a[j1 + 3] + a[j3 + 3];
+        y3r = a[j1 + 2] - a[j3 + 2];
+        y3i = a[j1 + 3] - a[j3 + 3];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i - x2i;
+        a[j + 2] = y0r + y2r;
+        a[j + 3] = y0i - y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i + x2i;
+        a[j1 + 2] = y0r - y2r;
+        a[j1 + 3] = y0i + y2i;
+        x0r = x1r + x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i + y3r;
+        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i - y3r;
+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = -a[j0 + 1] - a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = -a[j0 + 1] + a[j2 + 1];
+        y0r = a[j0 - 2] + a[j2 - 2];
+        y0i = -a[j0 - 1] - a[j2 - 1];
+        y1r = a[j0 - 2] - a[j2 - 2];
+        y1i = -a[j0 - 1] + a[j2 - 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 - 2] + a[j3 - 2];
+        y2i = a[j1 - 1] + a[j3 - 1];
+        y3r = a[j1 - 2] - a[j3 - 2];
+        y3i = a[j1 - 1] - a[j3 - 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i - x2i;
+        a[j0 - 2] = y0r + y2r;
+        a[j0 - 1] = y0i - y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i + x2i;
+        a[j1 - 2] = y0r - y2r;
+        a[j1 - 1] = y0i + y2i;
+        x0r = x1r + x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i + y3r;
+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i - y3r;
+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+    }
+    wk1r = csc1 * (wd1r + wn4r);
+    wk1i = csc1 * (wd1i + wn4r);
+    wk3r = csc3 * (wd3r - wn4r);
+    wk3i = csc3 * (wd3i - wn4r);
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2];
+    x0i = -a[j0 - 1] - a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = -a[j0 - 1] + a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i - x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = -a[j0 + 1] - a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = -a[j0 + 1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = -a[j0 + 3] - a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = -a[j0 + 3] + a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i - x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+#ifdef USE_CDFT_THREADS
+struct cdft_arg_st {
+    int n0;
+    int n;
+    double *a;
+    int nw;
+    double *w;
+};
+typedef struct cdft_arg_st cdft_arg_t;
+
+
+void cftrec4_th(int n, double *a, int nw, double *w)
+{
+    void *cftrec1_th(void *p);
+    void *cftrec2_th(void *p);
+    int i, idiv4, m, nthread;
+    cdft_thread_t th[4];
+    cdft_arg_t ag[4];
+    
+    nthread = 2;
+    idiv4 = 0;
+    m = n >> 1;
+    if (n > CDFT_4THREADS_BEGIN_N) {
+        nthread = 4;
+        idiv4 = 1;
+        m >>= 1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].n0 = n;
+        ag[i].n = m;
+        ag[i].a = &a[i * m];
+        ag[i].nw = nw;
+        ag[i].w = w;
+        if (i != idiv4) {
+            cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
+        } else {
+            cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
+        }
+    }
+    for (i = 0; i < nthread; i++) {
+        cdft_thread_wait(th[i]);
+    }
+}
+
+
+void *cftrec1_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+    
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+    }
+    cftleaf(m, 1, &a[n - m], nw, w);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+
+
+void *cftrec2_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+    
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    k = 1;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        k <<= 2;
+        cftmdl2(m, &a[n - m], &w[nw - m]);
+    }
+    cftleaf(m, 0, &a[n - m], nw, w);
+    k >>= 1;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+#endif /* USE_CDFT_THREADS */
+
+
+void cftrec4(int n, double *a, int nw, double *w)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m;
+    
+    m = n;
+    while (m > 512) {
+        m >>= 2;
+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+    }
+    cftleaf(m, 1, &a[n - m], nw, w);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+}
+
+
+int cfttree(int n, int j, int k, double *a, int nw, double *w)
+{
+    void cftmdl1(int n, double *a, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    int i, isplt, m;
+    
+    if ((k & 3) != 0) {
+        isplt = k & 1;
+        if (isplt != 0) {
+            cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]);
+        } else {
+            cftmdl2(n, &a[j - n], &w[nw - n]);
+        }
+    } else {
+        m = n;
+        for (i = k; (i & 3) == 0; i >>= 2) {
+            m <<= 2;
+        }
+        isplt = i & 1;
+        if (isplt != 0) {
+            while (m > 128) {
+                cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]);
+                m >>= 2;
+            }
+        } else {
+            while (m > 128) {
+                cftmdl2(m, &a[j - m], &w[nw - m]);
+                m >>= 2;
+            }
+        }
+    }
+    return isplt;
+}
+
+
+void cftleaf(int n, int isplt, double *a, int nw, double *w)
+{
+    void cftmdl1(int n, double *a, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    void cftf161(double *a, double *w);
+    void cftf162(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf082(double *a, double *w);
+    
+    if (n == 512) {
+        cftmdl1(128, a, &w[nw - 64]);
+        cftf161(a, &w[nw - 8]);
+        cftf162(&a[32], &w[nw - 32]);
+        cftf161(&a[64], &w[nw - 8]);
+        cftf161(&a[96], &w[nw - 8]);
+        cftmdl2(128, &a[128], &w[nw - 128]);
+        cftf161(&a[128], &w[nw - 8]);
+        cftf162(&a[160], &w[nw - 32]);
+        cftf161(&a[192], &w[nw - 8]);
+        cftf162(&a[224], &w[nw - 32]);
+        cftmdl1(128, &a[256], &w[nw - 64]);
+        cftf161(&a[256], &w[nw - 8]);
+        cftf162(&a[288], &w[nw - 32]);
+        cftf161(&a[320], &w[nw - 8]);
+        cftf161(&a[352], &w[nw - 8]);
+        if (isplt != 0) {
+            cftmdl1(128, &a[384], &w[nw - 64]);
+            cftf161(&a[480], &w[nw - 8]);
+        } else {
+            cftmdl2(128, &a[384], &w[nw - 128]);
+            cftf162(&a[480], &w[nw - 32]);
+        }
+        cftf161(&a[384], &w[nw - 8]);
+        cftf162(&a[416], &w[nw - 32]);
+        cftf161(&a[448], &w[nw - 8]);
+    } else {
+        cftmdl1(64, a, &w[nw - 32]);
+        cftf081(a, &w[nw - 8]);
+        cftf082(&a[16], &w[nw - 8]);
+        cftf081(&a[32], &w[nw - 8]);
+        cftf081(&a[48], &w[nw - 8]);
+        cftmdl2(64, &a[64], &w[nw - 64]);
+        cftf081(&a[64], &w[nw - 8]);
+        cftf082(&a[80], &w[nw - 8]);
+        cftf081(&a[96], &w[nw - 8]);
+        cftf082(&a[112], &w[nw - 8]);
+        cftmdl1(64, &a[128], &w[nw - 32]);
+        cftf081(&a[128], &w[nw - 8]);
+        cftf082(&a[144], &w[nw - 8]);
+        cftf081(&a[160], &w[nw - 8]);
+        cftf081(&a[176], &w[nw - 8]);
+        if (isplt != 0) {
+            cftmdl1(64, &a[192], &w[nw - 32]);
+            cftf081(&a[240], &w[nw - 8]);
+        } else {
+            cftmdl2(64, &a[192], &w[nw - 64]);
+            cftf082(&a[240], &w[nw - 8]);
+        }
+        cftf081(&a[192], &w[nw - 8]);
+        cftf082(&a[208], &w[nw - 8]);
+        cftf081(&a[224], &w[nw - 8]);
+    }
+}
+
+
+void cftmdl1(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, wk1r, wk1i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    k = 0;
+    for (j = 2; j < mh; j += 2) {
+        k += 4;
+        wk1r = w[k];
+        wk1i = w[k + 1];
+        wk3r = w[k + 2];
+        wk3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = a[j + 1] + a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = a[j + 1] - a[j2 + 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = a[j0 + 1] + a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = a[j0 + 1] - a[j2 + 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i + x2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+    }
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+}
+
+
+void cftmdl2(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, kr, m, mh;
+    double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
+    
+    mh = n >> 3;
+    m = 2 * mh;
+    wn4r = w[1];
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] - a[j2 + 1];
+    x0i = a[1] + a[j2];
+    x1r = a[0] + a[j2 + 1];
+    x1i = a[1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wn4r * (x2r - x2i);
+    y0i = wn4r * (x2i + x2r);
+    a[0] = x0r + y0r;
+    a[1] = x0i + y0i;
+    a[j1] = x0r - y0r;
+    a[j1 + 1] = x0i - y0i;
+    y0r = wn4r * (x3r - x3i);
+    y0i = wn4r * (x3i + x3r);
+    a[j2] = x1r - y0i;
+    a[j2 + 1] = x1i + y0r;
+    a[j3] = x1r + y0i;
+    a[j3 + 1] = x1i - y0r;
+    k = 0;
+    kr = 2 * m;
+    for (j = 2; j < mh; j += 2) {
+        k += 4;
+        wk1r = w[k];
+        wk1i = w[k + 1];
+        wk3r = w[k + 2];
+        wk3i = w[k + 3];
+        kr -= 4;
+        wd1i = w[kr];
+        wd1r = w[kr + 1];
+        wd3i = w[kr + 2];
+        wd3r = w[kr + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] - a[j2 + 1];
+        x0i = a[j + 1] + a[j2];
+        x1r = a[j] + a[j2 + 1];
+        x1i = a[j + 1] - a[j2];
+        x2r = a[j1] - a[j3 + 1];
+        x2i = a[j1 + 1] + a[j3];
+        x3r = a[j1] + a[j3 + 1];
+        x3i = a[j1 + 1] - a[j3];
+        y0r = wk1r * x0r - wk1i * x0i;
+        y0i = wk1r * x0i + wk1i * x0r;
+        y2r = wd1r * x2r - wd1i * x2i;
+        y2i = wd1r * x2i + wd1i * x2r;
+        a[j] = y0r + y2r;
+        a[j + 1] = y0i + y2i;
+        a[j1] = y0r - y2r;
+        a[j1 + 1] = y0i - y2i;
+        y0r = wk3r * x1r + wk3i * x1i;
+        y0i = wk3r * x1i - wk3i * x1r;
+        y2r = wd3r * x3r + wd3i * x3i;
+        y2i = wd3r * x3i - wd3i * x3r;
+        a[j2] = y0r + y2r;
+        a[j2 + 1] = y0i + y2i;
+        a[j3] = y0r - y2r;
+        a[j3 + 1] = y0i - y2i;
+        j0 = m - j;
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] - a[j2 + 1];
+        x0i = a[j0 + 1] + a[j2];
+        x1r = a[j0] + a[j2 + 1];
+        x1i = a[j0 + 1] - a[j2];
+        x2r = a[j1] - a[j3 + 1];
+        x2i = a[j1 + 1] + a[j3];
+        x3r = a[j1] + a[j3 + 1];
+        x3i = a[j1 + 1] - a[j3];
+        y0r = wd1i * x0r - wd1r * x0i;
+        y0i = wd1i * x0i + wd1r * x0r;
+        y2r = wk1i * x2r - wk1r * x2i;
+        y2i = wk1i * x2i + wk1r * x2r;
+        a[j0] = y0r + y2r;
+        a[j0 + 1] = y0i + y2i;
+        a[j1] = y0r - y2r;
+        a[j1 + 1] = y0i - y2i;
+        y0r = wd3i * x1r + wd3r * x1i;
+        y0i = wd3i * x1i - wd3r * x1r;
+        y2r = wk3i * x3r + wk3r * x3i;
+        y2i = wk3i * x3i - wk3r * x3r;
+        a[j2] = y0r + y2r;
+        a[j2 + 1] = y0i + y2i;
+        a[j3] = y0r - y2r;
+        a[j3 + 1] = y0i - y2i;
+    }
+    wk1r = w[m];
+    wk1i = w[m + 1];
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] - a[j2 + 1];
+    x0i = a[j0 + 1] + a[j2];
+    x1r = a[j0] + a[j2 + 1];
+    x1i = a[j0 + 1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wk1r * x0r - wk1i * x0i;
+    y0i = wk1r * x0i + wk1i * x0r;
+    y2r = wk1i * x2r - wk1r * x2i;
+    y2i = wk1i * x2i + wk1r * x2r;
+    a[j0] = y0r + y2r;
+    a[j0 + 1] = y0i + y2i;
+    a[j1] = y0r - y2r;
+    a[j1 + 1] = y0i - y2i;
+    y0r = wk1i * x1r - wk1r * x1i;
+    y0i = wk1i * x1i + wk1r * x1r;
+    y2r = wk1r * x3r - wk1i * x3i;
+    y2i = wk1r * x3i + wk1i * x3r;
+    a[j2] = y0r - y2r;
+    a[j2 + 1] = y0i - y2i;
+    a[j3] = y0r + y2r;
+    a[j3 + 1] = y0i + y2i;
+}
+
+
+void cftfx41(int n, double *a, int nw, double *w)
+{
+    void cftf161(double *a, double *w);
+    void cftf162(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf082(double *a, double *w);
+    
+    if (n == 128) {
+        cftf161(a, &w[nw - 8]);
+        cftf162(&a[32], &w[nw - 32]);
+        cftf161(&a[64], &w[nw - 8]);
+        cftf161(&a[96], &w[nw - 8]);
+    } else {
+        cftf081(a, &w[nw - 8]);
+        cftf082(&a[16], &w[nw - 8]);
+        cftf081(&a[32], &w[nw - 8]);
+        cftf081(&a[48], &w[nw - 8]);
+    }
+}
+
+
+void cftf161(double *a, double *w)
+{
+    double wn4r, wk1r, wk1i, 
+        x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+    
+    wn4r = w[1];
+    wk1r = w[2];
+    wk1i = w[3];
+    x0r = a[0] + a[16];
+    x0i = a[1] + a[17];
+    x1r = a[0] - a[16];
+    x1i = a[1] - a[17];
+    x2r = a[8] + a[24];
+    x2i = a[9] + a[25];
+    x3r = a[8] - a[24];
+    x3i = a[9] - a[25];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y4r = x0r - x2r;
+    y4i = x0i - x2i;
+    y8r = x1r - x3i;
+    y8i = x1i + x3r;
+    y12r = x1r + x3i;
+    y12i = x1i - x3r;
+    x0r = a[2] + a[18];
+    x0i = a[3] + a[19];
+    x1r = a[2] - a[18];
+    x1i = a[3] - a[19];
+    x2r = a[10] + a[26];
+    x2i = a[11] + a[27];
+    x3r = a[10] - a[26];
+    x3i = a[11] - a[27];
+    y1r = x0r + x2r;
+    y1i = x0i + x2i;
+    y5r = x0r - x2r;
+    y5i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y9r = wk1r * x0r - wk1i * x0i;
+    y9i = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y13r = wk1i * x0r - wk1r * x0i;
+    y13i = wk1i * x0i + wk1r * x0r;
+    x0r = a[4] + a[20];
+    x0i = a[5] + a[21];
+    x1r = a[4] - a[20];
+    x1i = a[5] - a[21];
+    x2r = a[12] + a[28];
+    x2i = a[13] + a[29];
+    x3r = a[12] - a[28];
+    x3i = a[13] - a[29];
+    y2r = x0r + x2r;
+    y2i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y10r = wn4r * (x0r - x0i);
+    y10i = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y14r = wn4r * (x0r + x0i);
+    y14i = wn4r * (x0i - x0r);
+    x0r = a[6] + a[22];
+    x0i = a[7] + a[23];
+    x1r = a[6] - a[22];
+    x1i = a[7] - a[23];
+    x2r = a[14] + a[30];
+    x2i = a[15] + a[31];
+    x3r = a[14] - a[30];
+    x3i = a[15] - a[31];
+    y3r = x0r + x2r;
+    y3i = x0i + x2i;
+    y7r = x0r - x2r;
+    y7i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y11r = wk1i * x0r - wk1r * x0i;
+    y11i = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y15r = wk1r * x0r - wk1i * x0i;
+    y15i = wk1r * x0i + wk1i * x0r;
+    x0r = y12r - y14r;
+    x0i = y12i - y14i;
+    x1r = y12r + y14r;
+    x1i = y12i + y14i;
+    x2r = y13r - y15r;
+    x2i = y13i - y15i;
+    x3r = y13r + y15r;
+    x3i = y13i + y15i;
+    a[24] = x0r + x2r;
+    a[25] = x0i + x2i;
+    a[26] = x0r - x2r;
+    a[27] = x0i - x2i;
+    a[28] = x1r - x3i;
+    a[29] = x1i + x3r;
+    a[30] = x1r + x3i;
+    a[31] = x1i - x3r;
+    x0r = y8r + y10r;
+    x0i = y8i + y10i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    x3r = y9r - y11r;
+    x3i = y9i - y11i;
+    a[16] = x0r + x2r;
+    a[17] = x0i + x2i;
+    a[18] = x0r - x2r;
+    a[19] = x0i - x2i;
+    a[20] = x1r - x3i;
+    a[21] = x1i + x3r;
+    a[22] = x1r + x3i;
+    a[23] = x1i - x3r;
+    x0r = y5r - y7i;
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x3r = wn4r * (x0r - x0i);
+    x3i = wn4r * (x0i + x0r);
+    x0r = y4r - y6i;
+    x0i = y4i + y6r;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[10] = x0r - x2r;
+    a[11] = x0i - x2i;
+    a[12] = x1r - x3i;
+    a[13] = x1i + x3r;
+    a[14] = x1r + x3i;
+    a[15] = x1i - x3r;
+    x0r = y0r + y2r;
+    x0i = y0i + y2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    x3r = y1r - y3r;
+    x3i = y1i - y3i;
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x0r - x2r;
+    a[3] = x0i - x2i;
+    a[4] = x1r - x3i;
+    a[5] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+}
+
+
+void cftf162(double *a, double *w)
+{
+    double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
+        x0r, x0i, x1r, x1i, x2r, x2i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+    
+    wn4r = w[1];
+    wk1r = w[4];
+    wk1i = w[5];
+    wk3r = w[6];
+    wk3i = -w[7];
+    wk2r = w[8];
+    wk2i = w[9];
+    x1r = a[0] - a[17];
+    x1i = a[1] + a[16];
+    x0r = a[8] - a[25];
+    x0i = a[9] + a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y0r = x1r + x2r;
+    y0i = x1i + x2i;
+    y4r = x1r - x2r;
+    y4i = x1i - x2i;
+    x1r = a[0] + a[17];
+    x1i = a[1] - a[16];
+    x0r = a[8] + a[25];
+    x0i = a[9] - a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y8r = x1r - x2i;
+    y8i = x1i + x2r;
+    y12r = x1r + x2i;
+    y12i = x1i - x2r;
+    x0r = a[2] - a[19];
+    x0i = a[3] + a[18];
+    x1r = wk1r * x0r - wk1i * x0i;
+    x1i = wk1r * x0i + wk1i * x0r;
+    x0r = a[10] - a[27];
+    x0i = a[11] + a[26];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y1r = x1r + x2r;
+    y1i = x1i + x2i;
+    y5r = x1r - x2r;
+    y5i = x1i - x2i;
+    x0r = a[2] + a[19];
+    x0i = a[3] - a[18];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[10] + a[27];
+    x0i = a[11] - a[26];
+    x2r = wk1r * x0r + wk1i * x0i;
+    x2i = wk1r * x0i - wk1i * x0r;
+    y9r = x1r - x2r;
+    y9i = x1i - x2i;
+    y13r = x1r + x2r;
+    y13i = x1i + x2i;
+    x0r = a[4] - a[21];
+    x0i = a[5] + a[20];
+    x1r = wk2r * x0r - wk2i * x0i;
+    x1i = wk2r * x0i + wk2i * x0r;
+    x0r = a[12] - a[29];
+    x0i = a[13] + a[28];
+    x2r = wk2i * x0r - wk2r * x0i;
+    x2i = wk2i * x0i + wk2r * x0r;
+    y2r = x1r + x2r;
+    y2i = x1i + x2i;
+    y6r = x1r - x2r;
+    y6i = x1i - x2i;
+    x0r = a[4] + a[21];
+    x0i = a[5] - a[20];
+    x1r = wk2i * x0r - wk2r * x0i;
+    x1i = wk2i * x0i + wk2r * x0r;
+    x0r = a[12] + a[29];
+    x0i = a[13] - a[28];
+    x2r = wk2r * x0r - wk2i * x0i;
+    x2i = wk2r * x0i + wk2i * x0r;
+    y10r = x1r - x2r;
+    y10i = x1i - x2i;
+    y14r = x1r + x2r;
+    y14i = x1i + x2i;
+    x0r = a[6] - a[23];
+    x0i = a[7] + a[22];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[14] - a[31];
+    x0i = a[15] + a[30];
+    x2r = wk1i * x0r - wk1r * x0i;
+    x2i = wk1i * x0i + wk1r * x0r;
+    y3r = x1r + x2r;
+    y3i = x1i + x2i;
+    y7r = x1r - x2r;
+    y7i = x1i - x2i;
+    x0r = a[6] + a[23];
+    x0i = a[7] - a[22];
+    x1r = wk1i * x0r + wk1r * x0i;
+    x1i = wk1i * x0i - wk1r * x0r;
+    x0r = a[14] + a[31];
+    x0i = a[15] - a[30];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y11r = x1r + x2r;
+    y11i = x1i + x2i;
+    y15r = x1r - x2r;
+    y15i = x1i - x2i;
+    x1r = y0r + y2r;
+    x1i = y0i + y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    a[0] = x1r + x2r;
+    a[1] = x1i + x2i;
+    a[2] = x1r - x2r;
+    a[3] = x1i - x2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r - y3r;
+    x2i = y1i - y3i;
+    a[4] = x1r - x2i;
+    a[5] = x1i + x2r;
+    a[6] = x1r + x2i;
+    a[7] = x1i - x2r;
+    x1r = y4r - y6i;
+    x1i = y4i + y6r;
+    x0r = y5r - y7i;
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[8] = x1r + x2r;
+    a[9] = x1i + x2i;
+    a[10] = x1r - x2r;
+    a[11] = x1i - x2i;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[12] = x1r - x2i;
+    a[13] = x1i + x2r;
+    a[14] = x1r + x2i;
+    a[15] = x1i - x2r;
+    x1r = y8r + y10r;
+    x1i = y8i + y10i;
+    x2r = y9r - y11r;
+    x2i = y9i - y11i;
+    a[16] = x1r + x2r;
+    a[17] = x1i + x2i;
+    a[18] = x1r - x2r;
+    a[19] = x1i - x2i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    a[20] = x1r - x2i;
+    a[21] = x1i + x2r;
+    a[22] = x1r + x2i;
+    a[23] = x1i - x2r;
+    x1r = y12r - y14i;
+    x1i = y12i + y14r;
+    x0r = y13r + y15i;
+    x0i = y13i - y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[24] = x1r + x2r;
+    a[25] = x1i + x2i;
+    a[26] = x1r - x2r;
+    a[27] = x1i - x2i;
+    x1r = y12r + y14i;
+    x1i = y12i - y14r;
+    x0r = y13r - y15i;
+    x0i = y13i + y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[28] = x1r - x2i;
+    a[29] = x1i + x2r;
+    a[30] = x1r + x2i;
+    a[31] = x1i - x2r;
+}
+
+
+void cftf081(double *a, double *w)
+{
+    double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = w[1];
+    x0r = a[0] + a[8];
+    x0i = a[1] + a[9];
+    x1r = a[0] - a[8];
+    x1i = a[1] - a[9];
+    x2r = a[4] + a[12];
+    x2i = a[5] + a[13];
+    x3r = a[4] - a[12];
+    x3i = a[5] - a[13];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y2r = x0r - x2r;
+    y2i = x0i - x2i;
+    y1r = x1r - x3i;
+    y1i = x1i + x3r;
+    y3r = x1r + x3i;
+    y3i = x1i - x3r;
+    x0r = a[2] + a[10];
+    x0i = a[3] + a[11];
+    x1r = a[2] - a[10];
+    x1i = a[3] - a[11];
+    x2r = a[6] + a[14];
+    x2i = a[7] + a[15];
+    x3r = a[6] - a[14];
+    x3i = a[7] - a[15];
+    y4r = x0r + x2r;
+    y4i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    x2r = x1r + x3i;
+    x2i = x1i - x3r;
+    y5r = wn4r * (x0r - x0i);
+    y5i = wn4r * (x0r + x0i);
+    y7r = wn4r * (x2r - x2i);
+    y7i = wn4r * (x2r + x2i);
+    a[8] = y1r + y5r;
+    a[9] = y1i + y5i;
+    a[10] = y1r - y5r;
+    a[11] = y1i - y5i;
+    a[12] = y3r - y7i;
+    a[13] = y3i + y7r;
+    a[14] = y3r + y7i;
+    a[15] = y3i - y7r;
+    a[0] = y0r + y4r;
+    a[1] = y0i + y4i;
+    a[2] = y0r - y4r;
+    a[3] = y0i - y4i;
+    a[4] = y2r - y6i;
+    a[5] = y2i + y6r;
+    a[6] = y2r + y6i;
+    a[7] = y2i - y6r;
+}
+
+
+void cftf082(double *a, double *w)
+{
+    double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = w[1];
+    wk1r = w[2];
+    wk1i = w[3];
+    y0r = a[0] - a[9];
+    y0i = a[1] + a[8];
+    y1r = a[0] + a[9];
+    y1i = a[1] - a[8];
+    x0r = a[4] - a[13];
+    x0i = a[5] + a[12];
+    y2r = wn4r * (x0r - x0i);
+    y2i = wn4r * (x0i + x0r);
+    x0r = a[4] + a[13];
+    x0i = a[5] - a[12];
+    y3r = wn4r * (x0r - x0i);
+    y3i = wn4r * (x0i + x0r);
+    x0r = a[2] - a[11];
+    x0i = a[3] + a[10];
+    y4r = wk1r * x0r - wk1i * x0i;
+    y4i = wk1r * x0i + wk1i * x0r;
+    x0r = a[2] + a[11];
+    x0i = a[3] - a[10];
+    y5r = wk1i * x0r - wk1r * x0i;
+    y5i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] - a[15];
+    x0i = a[7] + a[14];
+    y6r = wk1i * x0r - wk1r * x0i;
+    y6i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] + a[15];
+    x0i = a[7] - a[14];
+    y7r = wk1r * x0r - wk1i * x0i;
+    y7i = wk1r * x0i + wk1i * x0r;
+    x0r = y0r + y2r;
+    x0i = y0i + y2i;
+    x1r = y4r + y6r;
+    x1i = y4i + y6i;
+    a[0] = x0r + x1r;
+    a[1] = x0i + x1i;
+    a[2] = x0r - x1r;
+    a[3] = x0i - x1i;
+    x0r = y0r - y2r;
+    x0i = y0i - y2i;
+    x1r = y4r - y6r;
+    x1i = y4i - y6i;
+    a[4] = x0r - x1i;
+    a[5] = x0i + x1r;
+    a[6] = x0r + x1i;
+    a[7] = x0i - x1r;
+    x0r = y1r - y3i;
+    x0i = y1i + y3r;
+    x1r = y5r - y7r;
+    x1i = y5i - y7i;
+    a[8] = x0r + x1r;
+    a[9] = x0i + x1i;
+    a[10] = x0r - x1r;
+    a[11] = x0i - x1i;
+    x0r = y1r + y3i;
+    x0i = y1i - y3r;
+    x1r = y5r + y7r;
+    x1i = y5i + y7i;
+    a[12] = x0r - x1i;
+    a[13] = x0i + x1r;
+    a[14] = x0r + x1i;
+    a[15] = x0i - x1r;
+}
+
+
+void cftf040(double *a)
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[4];
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r - x3i;
+    a[3] = x1i + x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+}
+
+
+void cftb040(double *a)
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[4];
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r + x3i;
+    a[3] = x1i - x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r - x3i;
+    a[7] = x1i + x3r;
+}
+
+
+void cftx020(double *a)
+{
+    double x0r, x0i;
+    
+    x0r = a[0] - a[2];
+    x0i = a[1] - a[3];
+    a[0] += a[2];
+    a[1] += a[3];
+    a[2] = x0r;
+    a[3] = x0i;
+}
+
+
+void rftfsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr - wki * xi;
+        yi = wkr * xi + wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+
+void rftbsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    m = n >> 1;
+    ks = 2 * nc / m;
+    kk = 0;
+    for (j = 2; j < m; j += 2) {
+        k = n - j;
+        kk += ks;
+        wkr = 0.5 - c[nc - kk];
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr + wki * xi;
+        yi = wkr * xi - wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+
+void dctsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n;
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        kk += ks;
+        wkr = c[kk] - c[nc - kk];
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[j] - wkr * a[k];
+        a[j] = wkr * a[j] + wki * a[k];
+        a[k] = xr;
+    }
+    a[m] *= c[0];
+}
+
+
+void dstsub(int n, double *a, int nc, double *c)
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n;
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        kk += ks;
+        wkr = c[kk] - c[nc - kk];
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[k] - wkr * a[j];
+        a[k] = wkr * a[k] + wki * a[j];
+        a[j] = xr;
+    }
+    a[m] *= c[0];
+}
+
diff --git a/fftsg.f b/fftsg.f
new file mode 100644
index 0000000..6fb26c4
--- /dev/null
+++ b/fftsg.f
@@ -0,0 +1,2967 @@
+! Fast Fourier/Cosine/Sine Transform
+!     dimension   :one
+!     data length :power of 2
+!     decimation  :frequency
+!     radix       :split-radix
+!     data        :inplace
+!     table       :use
+! subroutines
+!     cdft: Complex Discrete Fourier Transform
+!     rdft: Real Discrete Fourier Transform
+!     ddct: Discrete Cosine Transform
+!     ddst: Discrete Sine Transform
+!     dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+!     dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+!     [definition]
+!         <case1>
+!             X(k) = sum_j=0^n-1 x(j)*exp(2*pi*i*j*k/n), 0<=k<n
+!         <case2>
+!             X(k) = sum_j=0^n-1 x(j)*exp(-2*pi*i*j*k/n), 0<=k<n
+!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call cdft(2*n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call cdft(2*n, -1, a, ip, w)
+!     [parameters]
+!         2*n          :data length (integer)
+!                       n >= 1, n = power of 2
+!         a(0:2*n-1)   :input/output data (real*8)
+!                       input data
+!                           a(2*j) = Re(x(j)), 
+!                           a(2*j+1) = Im(x(j)), 0<=j<n
+!                       output data
+!                           a(2*k) = Re(X(k)), 
+!                           a(2*k+1) = Im(X(k)), 0<=k<n
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n/2-1)   :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call cdft(2*n, -1, a, ip, w)
+!         is 
+!             call cdft(2*n, 1, a, ip, w)
+!             do j = 0, 2 * n - 1
+!                 a(j) = a(j) / n
+!             end do
+!         .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+!     [definition]
+!         <case1> RDFT
+!             R(k) = sum_j=0^n-1 a(j)*cos(2*pi*j*k/n), 0<=k<=n/2
+!             I(k) = sum_j=0^n-1 a(j)*sin(2*pi*j*k/n), 0<k<n/2
+!         <case2> IRDFT (excluding scale)
+!             a(k) = (R(0) + R(n/2)*cos(pi*k))/2 + 
+!                    sum_j=1^n/2-1 R(j)*cos(2*pi*j*k/n) + 
+!                    sum_j=1^n/2-1 I(j)*sin(2*pi*j*k/n), 0<=k<n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call rdft(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call rdft(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       <case1>
+!                           output data
+!                               a(2*k) = R(k), 0<=k<n/2
+!                               a(2*k+1) = I(k), 0<k<n/2
+!                               a(1) = R(n/2)
+!                       <case2>
+!                           input data
+!                               a(2*j) = R(j), 0<=j<n/2
+!                               a(2*j+1) = I(j), 0<j<n/2
+!                               a(1) = R(n/2)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n/2-1)   :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call rdft(n, 1, a, ip, w)
+!         is 
+!             call rdft(n, -1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+!     [definition]
+!         <case1> IDCT (excluding scale)
+!             C(k) = sum_j=0^n-1 a(j)*cos(pi*j*(k+1/2)/n), 0<=k<n
+!         <case2> DCT
+!             C(k) = sum_j=0^n-1 a(j)*cos(pi*(j+1/2)*k/n), 0<=k<n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddct(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddct(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       output data
+!                           a(k) = C(k), 0<=k<n
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/4-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddct(n, -1, a, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             call ddct(n, 1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+!     [definition]
+!         <case1> IDST (excluding scale)
+!             S(k) = sum_j=1^n A(j)*sin(pi*j*(k+1/2)/n), 0<=k<n
+!         <case2> DST
+!             S(k) = sum_j=0^n-1 a(j)*sin(pi*(j+1/2)*k/n), 0<k<=n
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddst(n, 1, a, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddst(n, -1, a, ip, w)
+!     [parameters]
+!         n            :data length (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       <case1>
+!                           input data
+!                               a(j) = A(j), 0<j<n
+!                               a(0) = A(n)
+!                           output data
+!                               a(k) = S(k), 0<=k<n
+!                       <case2>
+!                           output data
+!                               a(k) = S(k), 0<k<n
+!                               a(0) = S(n)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/2)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/2+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/4-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddst(n, -1, a, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             call ddst(n, 1, a, ip, w)
+!             do j = 0, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+!     [definition]
+!         C(k) = sum_j=0^n a(j)*cos(pi*j*k/n), 0<=k<=n
+!     [usage]
+!         ip(0) = 0  ! first time only
+!         call dfct(n, a, t, ip, w)
+!     [parameters]
+!         n            :data length - 1 (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n)       :input/output data (real*8)
+!                       output data
+!                           a(k) = C(k), 0<=k<=n
+!         t(0:n/2)     :work area (real*8)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/4)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/8-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             a(0) = a(0) / 2
+!             a(n) = a(n) / 2
+!             call dfct(n, a, t, ip, w)
+!         is 
+!             a(0) = a(0) / 2
+!             a(n) = a(n) / 2
+!             call dfct(n, a, t, ip, w)
+!             do j = 0, n
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! -------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+!     [definition]
+!         S(k) = sum_j=1^n-1 a(j)*sin(pi*j*k/n), 0<k<n
+!     [usage]
+!         ip(0) = 0  ! first time only
+!         call dfst(n, a, t, ip, w)
+!     [parameters]
+!         n            :data length + 1 (integer)
+!                       n >= 2, n = power of 2
+!         a(0:n-1)     :input/output data (real*8)
+!                       output data
+!                           a(k) = S(k), 0<k<n
+!                       (a(0) is used for work area)
+!         t(0:n/2-1)   :work area (real*8)
+!         ip(0:*)      :work area for bit reversal (integer)
+!                       length of ip >= 2+sqrt(n/4)
+!                       strictly, 
+!                       length of ip >= 
+!                           2+2**(int(log(n/4+0.5)/log(2.0))/2).
+!                       ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:n*5/8-1) :cos/sin table (real*8)
+!                       w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call dfst(n, a, t, ip, w)
+!         is 
+!             call dfst(n, a, t, ip, w)
+!             do j = 1, n - 1
+!                 a(j) = a(j) * 2 / n
+!             end do
+!         .
+!
+!
+! Appendix :
+!     The cos/sin table is recalculated when the larger table required.
+!     w() and ip() are compatible with all routines.
+!
+!
+      subroutine cdft(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), nw
+      real*8 a(0 : n - 1), w(0 : *)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      if (isgn .ge. 0) then
+          call cftfsub(n, a, ip, nw, w)
+      else
+          call cftbsub(n, a, ip, nw, w)
+      end if
+      end
+!
+      subroutine rdft(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xi
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 4 * nc) then
+          nc = n / 4
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call cftfsub(n, a, ip, nw, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, ip, nw, w)
+          end if
+          xi = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          a(1) = xi
+      else
+          a(1) = 0.5d0 * (a(0) - a(1))
+          a(0) = a(0) - a(1)
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call cftbsub(n, a, ip, nw, w)
+          else if (n .eq. 4) then
+              call cftbsub(n, a, ip, nw, w)
+          end if
+      end if
+      end
+!
+      subroutine ddct(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), j, nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xr
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          xr = a(n - 1)
+          do j = n - 2, 2, -2
+              a(j + 1) = a(j) - a(j - 1)
+              a(j) = a(j) + a(j - 1)
+          end do
+          a(1) = a(0) - xr
+          a(0) = a(0) + xr
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call cftbsub(n, a, ip, nw, w)
+          else if (n .eq. 4) then
+              call cftbsub(n, a, ip, nw, w)
+          end if
+      end if
+      call dctsub(n, a, nc, w(nw))
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call cftfsub(n, a, ip, nw, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, ip, nw, w)
+          end if
+          xr = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          do j = 2, n - 2, 2
+              a(j - 1) = a(j) - a(j + 1)
+              a(j) = a(j) + a(j + 1)
+          end do
+          a(n - 1) = xr
+      end if
+      end
+!
+      subroutine ddst(n, isgn, a, ip, w)
+      integer n, isgn, ip(0 : *), j, nw, nc
+      real*8 a(0 : n - 1), w(0 : *), xr
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          xr = a(n - 1)
+          do j = n - 2, 2, -2
+              a(j + 1) = -a(j) - a(j - 1)
+              a(j) = a(j) - a(j - 1)
+          end do
+          a(1) = a(0) + xr
+          a(0) = a(0) - xr
+          if (n .gt. 4) then
+              call rftbsub(n, a, nc, w(nw))
+              call cftbsub(n, a, ip, nw, w)
+          else if (n .eq. 4) then
+              call cftbsub(n, a, ip, nw, w)
+          end if
+      end if
+      call dstsub(n, a, nc, w(nw))
+      if (isgn .ge. 0) then
+          if (n .gt. 4) then
+              call cftfsub(n, a, ip, nw, w)
+              call rftfsub(n, a, nc, w(nw))
+          else if (n .eq. 4) then
+              call cftfsub(n, a, ip, nw, w)
+          end if
+          xr = a(0) - a(1)
+          a(0) = a(0) + a(1)
+          do j = 2, n - 2, 2
+              a(j - 1) = -a(j) - a(j + 1)
+              a(j) = a(j) - a(j + 1)
+          end do
+          a(n - 1) = -xr
+      end if
+      end
+!
+      subroutine dfct(n, a, t, ip, w)
+      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+      real*8 a(0 : n), t(0 : n / 2), w(0 : *), xr, xi, yr, yi
+      nw = ip(0)
+      if (n .gt. 8 * nw) then
+          nw = n / 8
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 2 * nc) then
+          nc = n / 2
+          call makect(nc, ip, w(nw))
+      end if
+      m = n / 2
+      yi = a(m)
+      xi = a(0) + a(n)
+      a(0) = a(0) - a(n)
+      t(0) = xi - yi
+      t(m) = xi + yi
+      if (n .gt. 2) then
+          mh = m / 2
+          do j = 1, mh - 1
+              k = m - j
+              xr = a(j) - a(n - j)
+              xi = a(j) + a(n - j)
+              yr = a(k) - a(n - k)
+              yi = a(k) + a(n - k)
+              a(j) = xr
+              a(k) = yr
+              t(j) = xi - yi
+              t(k) = xi + yi
+          end do
+          t(mh) = a(mh) + a(n - mh)
+          a(mh) = a(mh) - a(n - mh)
+          call dctsub(m, a, nc, w(nw))
+          if (m .gt. 4) then
+              call cftfsub(m, a, ip, nw, w)
+              call rftfsub(m, a, nc, w(nw))
+          else if (m .eq. 4) then
+              call cftfsub(m, a, ip, nw, w)
+          end if
+          a(n - 1) = a(0) - a(1)
+          a(1) = a(0) + a(1)
+          do j = m - 2, 2, -2
+              a(2 * j + 1) = a(j) + a(j + 1)
+              a(2 * j - 1) = a(j) - a(j + 1)
+          end do
+          l = 2
+          m = mh
+          do while (m .ge. 2)
+              call dctsub(m, t, nc, w(nw))
+              if (m .gt. 4) then
+                  call cftfsub(m, t, ip, nw, w)
+                  call rftfsub(m, t, nc, w(nw))
+              else if (m .eq. 4) then
+                  call cftfsub(m, t, ip, nw, w)
+              end if
+              a(n - l) = t(0) - t(1)
+              a(l) = t(0) + t(1)
+              k = 0
+              do j = 2, m - 2, 2
+                  k = k + 4 * l
+                  a(k - l) = t(j) - t(j + 1)
+                  a(k + l) = t(j) + t(j + 1)
+              end do
+              l = 2 * l
+              mh = m / 2
+              do j = 0, mh - 1
+                  k = m - j
+                  t(j) = t(m + k) - t(m + j)
+                  t(k) = t(m + k) + t(m + j)
+              end do
+              t(mh) = t(m + mh)
+              m = mh
+          end do
+          a(l) = t(0)
+          a(n) = t(2) - t(1)
+          a(0) = t(2) + t(1)
+      else
+          a(1) = a(0)
+          a(2) = t(0)
+          a(0) = t(1)
+      end if
+      end
+!
+      subroutine dfst(n, a, t, ip, w)
+      integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+      real*8 a(0 : n - 1), t(0 : n / 2 - 1), w(0 : *), xr, xi, yr, yi
+      nw = ip(0)
+      if (n .gt. 8 * nw) then
+          nw = n / 8
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. 2 * nc) then
+          nc = n / 2
+          call makect(nc, ip, w(nw))
+      end if
+      if (n .gt. 2) then
+          m = n / 2
+          mh = m / 2
+          do j = 1, mh - 1
+              k = m - j
+              xr = a(j) + a(n - j)
+              xi = a(j) - a(n - j)
+              yr = a(k) + a(n - k)
+              yi = a(k) - a(n - k)
+              a(j) = xr
+              a(k) = yr
+              t(j) = xi + yi
+              t(k) = xi - yi
+          end do
+          t(0) = a(mh) - a(n - mh)
+          a(mh) = a(mh) + a(n - mh)
+          a(0) = a(m)
+          call dstsub(m, a, nc, w(nw))
+          if (m .gt. 4) then
+              call cftfsub(m, a, ip, nw, w)
+              call rftfsub(m, a, nc, w(nw))
+          else if (m .eq. 4) then
+              call cftfsub(m, a, ip, nw, w)
+          end if
+          a(n - 1) = a(1) - a(0)
+          a(1) = a(0) + a(1)
+          do j = m - 2, 2, -2
+              a(2 * j + 1) = a(j) - a(j + 1)
+              a(2 * j - 1) = -a(j) - a(j + 1)
+          end do
+          l = 2
+          m = mh
+          do while (m .ge. 2)
+              call dstsub(m, t, nc, w(nw))
+              if (m .gt. 4) then
+                  call cftfsub(m, t, ip, nw, w)
+                  call rftfsub(m, t, nc, w(nw))
+              else if (m .eq. 4) then
+                  call cftfsub(m, t, ip, nw, w)
+              end if
+              a(n - l) = t(1) - t(0)
+              a(l) = t(0) + t(1)
+              k = 0
+              do j = 2, m - 2, 2
+                  k = k + 4 * l
+                  a(k - l) = -t(j) - t(j + 1)
+                  a(k + l) = t(j) - t(j + 1)
+              end do
+              l = 2 * l
+              mh = m / 2
+              do j = 1, mh - 1
+                  k = m - j
+                  t(j) = t(m + k) + t(m + j)
+                  t(k) = t(m + k) - t(m + j)
+              end do
+              t(0) = t(m + mh)
+              m = mh
+          end do
+          a(l) = t(0)
+      end if
+      a(0) = 0
+      end
+!
+! -------- initializing routines --------
+!
+      subroutine makewt(nw, ip, w)
+      integer nw, ip(0 : *), j, nwh, nw0, nw1
+      real*8 w(0 : nw - 1), delta, wn4r, wk1r, wk1i, wk3r, wk3i
+      ip(0) = nw
+      ip(1) = 1
+      if (nw .gt. 2) then
+          nwh = nw / 2
+          delta = atan(1.0d0) / nwh
+          wn4r = cos(delta * nwh)
+          w(0) = 1
+          w(1) = wn4r
+          if (nwh .eq. 4) then
+              w(2) = cos(delta * 2)
+              w(3) = sin(delta * 2)
+          else if (nwh .gt. 4) then
+              call makeipt(nw, ip)
+              w(2) = 0.5d0 / cos(delta * 2)
+              w(3) = 0.5d0 / cos(delta * 6)
+              do j = 4, nwh - 4, 4
+                  w(j) = cos(delta * j)
+                  w(j + 1) = sin(delta * j)
+                  w(j + 2) = cos(3 * delta * j)
+                  w(j + 3) = -sin(3 * delta * j)
+              end do
+          end if
+          nw0 = 0
+          do while (nwh .gt. 2)
+              nw1 = nw0 + nwh
+              nwh = nwh / 2
+              w(nw1) = 1
+              w(nw1 + 1) = wn4r
+              if (nwh .eq. 4) then
+                  wk1r = w(nw0 + 4)
+                  wk1i = w(nw0 + 5)
+                  w(nw1 + 2) = wk1r
+                  w(nw1 + 3) = wk1i
+              else if (nwh .gt. 4) then
+                  wk1r = w(nw0 + 4)
+                  wk3r = w(nw0 + 6)
+                  w(nw1 + 2) = 0.5d0 / wk1r
+                  w(nw1 + 3) = 0.5d0 / wk3r
+                  do j = 4, nwh - 4, 4
+                      wk1r = w(nw0 + 2 * j)
+                      wk1i = w(nw0 + 2 * j + 1)
+                      wk3r = w(nw0 + 2 * j + 2)
+                      wk3i = w(nw0 + 2 * j + 3)
+                      w(nw1 + j) = wk1r
+                      w(nw1 + j + 1) = wk1i
+                      w(nw1 + j + 2) = wk3r
+                      w(nw1 + j + 3) = wk3i
+                  end do
+              end if
+              nw0 = nw1
+          end do
+      end if
+      end
+!
+      subroutine makeipt(nw, ip)
+      integer nw, ip(0 : *), j, l, m, m2, p, q
+      ip(2) = 0
+      ip(3) = 16
+      m = 2
+      l = nw
+      do while (l .gt. 32)
+          m2 = 2 * m
+          q = 8 * m2
+          do j = m, m2 - 1
+              p = 4 * ip(j)
+              ip(m + j) = p
+              ip(m2 + j) = p + q
+          end do
+          m = m2
+          l = l / 4
+      end do
+      end
+!
+      subroutine makect(nc, ip, c)
+      integer nc, ip(0 : *), j, nch
+      real*8 c(0 : nc - 1), delta
+      ip(1) = nc
+      if (nc .gt. 1) then
+          nch = nc / 2
+          delta = atan(1.0d0) / nch
+          c(0) = cos(delta * nch)
+          c(nch) = 0.5d0 * c(0)
+          do j = 1, nch - 1
+              c(j) = 0.5d0 * cos(delta * j)
+              c(nc - j) = 0.5d0 * sin(delta * j)
+          end do
+      end if
+      end
+!
+! -------- child routines --------
+!
+      subroutine cftfsub(n, a, ip, nw, w)
+      integer n, ip(0 : *), nw
+      real*8 a(0 : n - 1), w(0 : nw - 1)
+      if (n .gt. 8) then
+          if (n .gt. 32) then
+              call cftf1st(n, a, w(nw - n / 4))
+              if (n .gt. 512) then
+                  call cftrec4(n, a, nw, w)
+              else if (n .gt. 128) then
+                  call cftleaf(n, 1, a, nw, w)
+              else
+                  call cftfx41(n, a, nw, w)
+              end if
+              call bitrv2(n, ip, a)
+          else if (n .eq. 32) then
+              call cftf161(a, w(nw - 8))
+              call bitrv216(a)
+          else
+              call cftf081(a, w)
+              call bitrv208(a)
+          end if
+      else if (n .eq. 8) then
+          call cftf040(a)
+      else if (n .eq. 4) then
+          call cftx020(a)
+      end if
+      end
+!
+      subroutine cftbsub(n, a, ip, nw, w)
+      integer n, ip(0 : *), nw
+      real*8 a(0 : n - 1), w(0 : nw - 1)
+      if (n .gt. 8) then
+          if (n .gt. 32) then
+              call cftb1st(n, a, w(nw - n / 4))
+              if (n .gt. 512) then
+                  call cftrec4(n, a, nw, w)
+              else if (n .gt. 128) then
+                  call cftleaf(n, 1, a, nw, w)
+              else
+                  call cftfx41(n, a, nw, w)
+              end if
+              call bitrv2conj(n, ip, a)
+          else if (n .eq. 32) then
+              call cftf161(a, w(nw - 8))
+              call bitrv216neg(a)
+          else
+              call cftf081(a, w)
+              call bitrv208neg(a)
+          end if
+      else if (n .eq. 8) then
+          call cftb040(a)
+      else if (n .eq. 4) then
+          call cftx020(a)
+      end if
+      end
+!
+      subroutine bitrv2(n, ip, a)
+      integer n, ip(0 : *), j, j1, k, k1, l, m, nh, nm
+      real*8 a(0 : n - 1), xr, xi, yr, yi
+      m = 1
+      l = n / 4
+      do while (l .gt. 8)
+          m = m * 2
+          l = l / 4
+      end do
+      nh = n / 2
+      nm = 4 * m
+      if (l .eq. 8) then
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 4 * j + 2 * ip(m + k)
+                  k1 = 4 * k + 2 * ip(m + j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nh
+                  k1 = k1 + 2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + 2
+                  k1 = k1 + nh
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nh
+                  k1 = k1 - 2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 4 * k + 2 * ip(m + k)
+              j1 = k1 + 2
+              k1 = k1 + nh
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 + nm
+              k1 = k1 + 2 * nm
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 + nm
+              k1 = k1 - nm
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 - 2
+              k1 = k1 - nh
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 + nh + 2
+              k1 = k1 + nh + 2
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 - nh + nm
+              k1 = k1 + 2 * nm - 2
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+          end do
+      else
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 4 * j + ip(m + k)
+                  k1 = 4 * k + ip(m + j)
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nh
+                  k1 = k1 + 2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + 2
+                  k1 = k1 + nh
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nh
+                  k1 = k1 - 2
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = a(j1 + 1)
+                  yr = a(k1)
+                  yi = a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 4 * k + ip(m + k)
+              j1 = k1 + 2
+              k1 = k1 + nh
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 + nm
+              k1 = k1 + nm
+              xr = a(j1)
+              xi = a(j1 + 1)
+              yr = a(k1)
+              yi = a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+          end do
+      end if
+      end
+!
+      subroutine bitrv2conj(n, ip, a)
+      integer n, ip(0 : *), j, j1, k, k1, l, m, nh, nm
+      real*8 a(0 : n - 1), xr, xi, yr, yi
+      m = 1
+      l = n / 4
+      do while (l .gt. 8)
+          m = m * 2
+          l = l / 4
+      end do
+      nh = n / 2
+      nm = 4 * m
+      if (l .eq. 8) then
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 4 * j + 2 * ip(m + k)
+                  k1 = 4 * k + 2 * ip(m + j)
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nh
+                  k1 = k1 + 2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + 2
+                  k1 = k1 + nh
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nh
+                  k1 = k1 - 2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - 2 * nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 4 * k + 2 * ip(m + k)
+              j1 = k1 + 2
+              k1 = k1 + nh
+              a(j1 - 1) = -a(j1 - 1)
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              a(k1 + 3) = -a(k1 + 3)
+              j1 = j1 + nm
+              k1 = k1 + 2 * nm
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 + nm
+              k1 = k1 - nm
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 - 2
+              k1 = k1 - nh
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 + nh + 2
+              k1 = k1 + nh + 2
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              j1 = j1 - nh + nm
+              k1 = k1 + 2 * nm - 2
+              a(j1 - 1) = -a(j1 - 1)
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              a(k1 + 3) = -a(k1 + 3)
+          end do
+      else
+          do k = 0, m - 1
+              do j = 0, k - 1
+                  j1 = 4 * j + ip(m + k)
+                  k1 = 4 * k + ip(m + j)
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nh
+                  k1 = k1 + 2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + 2
+                  k1 = k1 + nh
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 + nm
+                  k1 = k1 + nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nh
+                  k1 = k1 - 2
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+                  j1 = j1 - nm
+                  k1 = k1 - nm
+                  xr = a(j1)
+                  xi = -a(j1 + 1)
+                  yr = a(k1)
+                  yi = -a(k1 + 1)
+                  a(j1) = yr
+                  a(j1 + 1) = yi
+                  a(k1) = xr
+                  a(k1 + 1) = xi
+              end do
+              k1 = 4 * k + ip(m + k)
+              j1 = k1 + 2
+              k1 = k1 + nh
+              a(j1 - 1) = -a(j1 - 1)
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              a(k1 + 3) = -a(k1 + 3)
+              j1 = j1 + nm
+              k1 = k1 + nm
+              a(j1 - 1) = -a(j1 - 1)
+              xr = a(j1)
+              xi = -a(j1 + 1)
+              yr = a(k1)
+              yi = -a(k1 + 1)
+              a(j1) = yr
+              a(j1 + 1) = yi
+              a(k1) = xr
+              a(k1 + 1) = xi
+              a(k1 + 3) = -a(k1 + 3)
+          end do
+      end if
+      end
+!
+      subroutine bitrv216(a)
+      real*8 a(0 : 31), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
+      real*8 x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i
+      real*8 x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i
+      x1r = a(2)
+      x1i = a(3)
+      x2r = a(4)
+      x2i = a(5)
+      x3r = a(6)
+      x3i = a(7)
+      x4r = a(8)
+      x4i = a(9)
+      x5r = a(10)
+      x5i = a(11)
+      x7r = a(14)
+      x7i = a(15)
+      x8r = a(16)
+      x8i = a(17)
+      x10r = a(20)
+      x10i = a(21)
+      x11r = a(22)
+      x11i = a(23)
+      x12r = a(24)
+      x12i = a(25)
+      x13r = a(26)
+      x13i = a(27)
+      x14r = a(28)
+      x14i = a(29)
+      a(2) = x8r
+      a(3) = x8i
+      a(4) = x4r
+      a(5) = x4i
+      a(6) = x12r
+      a(7) = x12i
+      a(8) = x2r
+      a(9) = x2i
+      a(10) = x10r
+      a(11) = x10i
+      a(14) = x14r
+      a(15) = x14i
+      a(16) = x1r
+      a(17) = x1i
+      a(20) = x5r
+      a(21) = x5i
+      a(22) = x13r
+      a(23) = x13i
+      a(24) = x3r
+      a(25) = x3i
+      a(26) = x11r
+      a(27) = x11i
+      a(28) = x7r
+      a(29) = x7i
+      end
+!
+      subroutine bitrv216neg(a)
+      real*8 a(0 : 31), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
+      real*8 x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i
+      real*8 x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i
+      real*8 x13r, x13i, x14r, x14i, x15r, x15i
+      x1r = a(2)
+      x1i = a(3)
+      x2r = a(4)
+      x2i = a(5)
+      x3r = a(6)
+      x3i = a(7)
+      x4r = a(8)
+      x4i = a(9)
+      x5r = a(10)
+      x5i = a(11)
+      x6r = a(12)
+      x6i = a(13)
+      x7r = a(14)
+      x7i = a(15)
+      x8r = a(16)
+      x8i = a(17)
+      x9r = a(18)
+      x9i = a(19)
+      x10r = a(20)
+      x10i = a(21)
+      x11r = a(22)
+      x11i = a(23)
+      x12r = a(24)
+      x12i = a(25)
+      x13r = a(26)
+      x13i = a(27)
+      x14r = a(28)
+      x14i = a(29)
+      x15r = a(30)
+      x15i = a(31)
+      a(2) = x15r
+      a(3) = x15i
+      a(4) = x7r
+      a(5) = x7i
+      a(6) = x11r
+      a(7) = x11i
+      a(8) = x3r
+      a(9) = x3i
+      a(10) = x13r
+      a(11) = x13i
+      a(12) = x5r
+      a(13) = x5i
+      a(14) = x9r
+      a(15) = x9i
+      a(16) = x1r
+      a(17) = x1i
+      a(18) = x14r
+      a(19) = x14i
+      a(20) = x6r
+      a(21) = x6i
+      a(22) = x10r
+      a(23) = x10i
+      a(24) = x2r
+      a(25) = x2i
+      a(26) = x12r
+      a(27) = x12i
+      a(28) = x4r
+      a(29) = x4i
+      a(30) = x8r
+      a(31) = x8i
+      end
+!
+      subroutine bitrv208(a)
+      real*8 a(0 : 15), x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i
+      x1r = a(2)
+      x1i = a(3)
+      x3r = a(6)
+      x3i = a(7)
+      x4r = a(8)
+      x4i = a(9)
+      x6r = a(12)
+      x6i = a(13)
+      a(2) = x4r
+      a(3) = x4i
+      a(6) = x6r
+      a(7) = x6i
+      a(8) = x1r
+      a(9) = x1i
+      a(12) = x3r
+      a(13) = x3i
+      end
+!
+      subroutine bitrv208neg(a)
+      real*8 a(0 : 15), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
+      real*8 x5r, x5i, x6r, x6i, x7r, x7i
+      x1r = a(2)
+      x1i = a(3)
+      x2r = a(4)
+      x2i = a(5)
+      x3r = a(6)
+      x3i = a(7)
+      x4r = a(8)
+      x4i = a(9)
+      x5r = a(10)
+      x5i = a(11)
+      x6r = a(12)
+      x6i = a(13)
+      x7r = a(14)
+      x7i = a(15)
+      a(2) = x7r
+      a(3) = x7i
+      a(4) = x3r
+      a(5) = x3i
+      a(6) = x5r
+      a(7) = x5i
+      a(8) = x1r
+      a(9) = x1i
+      a(10) = x6r
+      a(11) = x6i
+      a(12) = x2r
+      a(13) = x2i
+      a(14) = x4r
+      a(15) = x4i
+      end
+!
+      subroutine cftf1st(n, a, w)
+      integer n, j, j0, j1, j2, j3, k, m, mh
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i
+      real*8 wd1r, wd1i, wd3r, wd3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      mh = n / 8
+      m = 2 * mh
+      j1 = m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(0) + a(j2)
+      x0i = a(1) + a(j2 + 1)
+      x1r = a(0) - a(j2)
+      x1i = a(1) - a(j2 + 1)
+      x2r = a(j1) + a(j3)
+      x2i = a(j1 + 1) + a(j3 + 1)
+      x3r = a(j1) - a(j3)
+      x3i = a(j1 + 1) - a(j3 + 1)
+      a(0) = x0r + x2r
+      a(1) = x0i + x2i
+      a(j1) = x0r - x2r
+      a(j1 + 1) = x0i - x2i
+      a(j2) = x1r - x3i
+      a(j2 + 1) = x1i + x3r
+      a(j3) = x1r + x3i
+      a(j3 + 1) = x1i - x3r
+      wn4r = w(1)
+      csc1 = w(2)
+      csc3 = w(3)
+      wd1r = 1
+      wd1i = 0
+      wd3r = 1
+      wd3i = 0
+      k = 0
+      do j = 2, mh - 6, 4
+          k = k + 4
+          wk1r = csc1 * (wd1r + w(k))
+          wk1i = csc1 * (wd1i + w(k + 1))
+          wk3r = csc3 * (wd3r + w(k + 2))
+          wk3i = csc3 * (wd3i + w(k + 3))
+          wd1r = w(k)
+          wd1i = w(k + 1)
+          wd3r = w(k + 2)
+          wd3i = w(k + 3)
+          j1 = j + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j) + a(j2)
+          x0i = a(j + 1) + a(j2 + 1)
+          x1r = a(j) - a(j2)
+          x1i = a(j + 1) - a(j2 + 1)
+          y0r = a(j + 2) + a(j2 + 2)
+          y0i = a(j + 3) + a(j2 + 3)
+          y1r = a(j + 2) - a(j2 + 2)
+          y1i = a(j + 3) - a(j2 + 3)
+          x2r = a(j1) + a(j3)
+          x2i = a(j1 + 1) + a(j3 + 1)
+          x3r = a(j1) - a(j3)
+          x3i = a(j1 + 1) - a(j3 + 1)
+          y2r = a(j1 + 2) + a(j3 + 2)
+          y2i = a(j1 + 3) + a(j3 + 3)
+          y3r = a(j1 + 2) - a(j3 + 2)
+          y3i = a(j1 + 3) - a(j3 + 3)
+          a(j) = x0r + x2r
+          a(j + 1) = x0i + x2i
+          a(j + 2) = y0r + y2r
+          a(j + 3) = y0i + y2i
+          a(j1) = x0r - x2r
+          a(j1 + 1) = x0i - x2i
+          a(j1 + 2) = y0r - y2r
+          a(j1 + 3) = y0i - y2i
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          a(j2) = wk1r * x0r - wk1i * x0i
+          a(j2 + 1) = wk1r * x0i + wk1i * x0r
+          x0r = y1r - y3i
+          x0i = y1i + y3r
+          a(j2 + 2) = wd1r * x0r - wd1i * x0i
+          a(j2 + 3) = wd1r * x0i + wd1i * x0r
+          x0r = x1r + x3i
+          x0i = x1i - x3r
+          a(j3) = wk3r * x0r + wk3i * x0i
+          a(j3 + 1) = wk3r * x0i - wk3i * x0r
+          x0r = y1r + y3i
+          x0i = y1i - y3r
+          a(j3 + 2) = wd3r * x0r + wd3i * x0i
+          a(j3 + 3) = wd3r * x0i - wd3i * x0r
+          j0 = m - j
+          j1 = j0 + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j0) + a(j2)
+          x0i = a(j0 + 1) + a(j2 + 1)
+          x1r = a(j0) - a(j2)
+          x1i = a(j0 + 1) - a(j2 + 1)
+          y0r = a(j0 - 2) + a(j2 - 2)
+          y0i = a(j0 - 1) + a(j2 - 1)
+          y1r = a(j0 - 2) - a(j2 - 2)
+          y1i = a(j0 - 1) - a(j2 - 1)
+          x2r = a(j1) + a(j3)
+          x2i = a(j1 + 1) + a(j3 + 1)
+          x3r = a(j1) - a(j3)
+          x3i = a(j1 + 1) - a(j3 + 1)
+          y2r = a(j1 - 2) + a(j3 - 2)
+          y2i = a(j1 - 1) + a(j3 - 1)
+          y3r = a(j1 - 2) - a(j3 - 2)
+          y3i = a(j1 - 1) - a(j3 - 1)
+          a(j0) = x0r + x2r
+          a(j0 + 1) = x0i + x2i
+          a(j0 - 2) = y0r + y2r
+          a(j0 - 1) = y0i + y2i
+          a(j1) = x0r - x2r
+          a(j1 + 1) = x0i - x2i
+          a(j1 - 2) = y0r - y2r
+          a(j1 - 1) = y0i - y2i
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          a(j2) = wk1i * x0r - wk1r * x0i
+          a(j2 + 1) = wk1i * x0i + wk1r * x0r
+          x0r = y1r - y3i
+          x0i = y1i + y3r
+          a(j2 - 2) = wd1i * x0r - wd1r * x0i
+          a(j2 - 1) = wd1i * x0i + wd1r * x0r
+          x0r = x1r + x3i
+          x0i = x1i - x3r
+          a(j3) = wk3i * x0r + wk3r * x0i
+          a(j3 + 1) = wk3i * x0i - wk3r * x0r
+          x0r = y1r + y3i
+          x0i = y1i - y3r
+          a(j3 - 2) = wd3i * x0r + wd3r * x0i
+          a(j3 - 1) = wd3i * x0i - wd3r * x0r
+      end do
+      wk1r = csc1 * (wd1r + wn4r)
+      wk1i = csc1 * (wd1i + wn4r)
+      wk3r = csc3 * (wd3r - wn4r)
+      wk3i = csc3 * (wd3i - wn4r)
+      j0 = mh
+      j1 = j0 + m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(j0 - 2) + a(j2 - 2)
+      x0i = a(j0 - 1) + a(j2 - 1)
+      x1r = a(j0 - 2) - a(j2 - 2)
+      x1i = a(j0 - 1) - a(j2 - 1)
+      x2r = a(j1 - 2) + a(j3 - 2)
+      x2i = a(j1 - 1) + a(j3 - 1)
+      x3r = a(j1 - 2) - a(j3 - 2)
+      x3i = a(j1 - 1) - a(j3 - 1)
+      a(j0 - 2) = x0r + x2r
+      a(j0 - 1) = x0i + x2i
+      a(j1 - 2) = x0r - x2r
+      a(j1 - 1) = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      a(j2 - 2) = wk1r * x0r - wk1i * x0i
+      a(j2 - 1) = wk1r * x0i + wk1i * x0r
+      x0r = x1r + x3i
+      x0i = x1i - x3r
+      a(j3 - 2) = wk3r * x0r + wk3i * x0i
+      a(j3 - 1) = wk3r * x0i - wk3i * x0r
+      x0r = a(j0) + a(j2)
+      x0i = a(j0 + 1) + a(j2 + 1)
+      x1r = a(j0) - a(j2)
+      x1i = a(j0 + 1) - a(j2 + 1)
+      x2r = a(j1) + a(j3)
+      x2i = a(j1 + 1) + a(j3 + 1)
+      x3r = a(j1) - a(j3)
+      x3i = a(j1 + 1) - a(j3 + 1)
+      a(j0) = x0r + x2r
+      a(j0 + 1) = x0i + x2i
+      a(j1) = x0r - x2r
+      a(j1 + 1) = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      a(j2) = wn4r * (x0r - x0i)
+      a(j2 + 1) = wn4r * (x0i + x0r)
+      x0r = x1r + x3i
+      x0i = x1i - x3r
+      a(j3) = -wn4r * (x0r + x0i)
+      a(j3 + 1) = -wn4r * (x0i - x0r)
+      x0r = a(j0 + 2) + a(j2 + 2)
+      x0i = a(j0 + 3) + a(j2 + 3)
+      x1r = a(j0 + 2) - a(j2 + 2)
+      x1i = a(j0 + 3) - a(j2 + 3)
+      x2r = a(j1 + 2) + a(j3 + 2)
+      x2i = a(j1 + 3) + a(j3 + 3)
+      x3r = a(j1 + 2) - a(j3 + 2)
+      x3i = a(j1 + 3) - a(j3 + 3)
+      a(j0 + 2) = x0r + x2r
+      a(j0 + 3) = x0i + x2i
+      a(j1 + 2) = x0r - x2r
+      a(j1 + 3) = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      a(j2 + 2) = wk1i * x0r - wk1r * x0i
+      a(j2 + 3) = wk1i * x0i + wk1r * x0r
+      x0r = x1r + x3i
+      x0i = x1i - x3r
+      a(j3 + 2) = wk3i * x0r + wk3r * x0i
+      a(j3 + 3) = wk3i * x0i - wk3r * x0r
+      end
+!
+      subroutine cftb1st(n, a, w)
+      integer n, j, j0, j1, j2, j3, k, m, mh
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i
+      real*8 wd1r, wd1i, wd3r, wd3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      mh = n / 8
+      m = 2 * mh
+      j1 = m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(0) + a(j2)
+      x0i = -a(1) - a(j2 + 1)
+      x1r = a(0) - a(j2)
+      x1i = -a(1) + a(j2 + 1)
+      x2r = a(j1) + a(j3)
+      x2i = a(j1 + 1) + a(j3 + 1)
+      x3r = a(j1) - a(j3)
+      x3i = a(j1 + 1) - a(j3 + 1)
+      a(0) = x0r + x2r
+      a(1) = x0i - x2i
+      a(j1) = x0r - x2r
+      a(j1 + 1) = x0i + x2i
+      a(j2) = x1r + x3i
+      a(j2 + 1) = x1i + x3r
+      a(j3) = x1r - x3i
+      a(j3 + 1) = x1i - x3r
+      wn4r = w(1)
+      csc1 = w(2)
+      csc3 = w(3)
+      wd1r = 1
+      wd1i = 0
+      wd3r = 1
+      wd3i = 0
+      k = 0
+      do j = 2, mh - 6, 4
+          k = k + 4
+          wk1r = csc1 * (wd1r + w(k))
+          wk1i = csc1 * (wd1i + w(k + 1))
+          wk3r = csc3 * (wd3r + w(k + 2))
+          wk3i = csc3 * (wd3i + w(k + 3))
+          wd1r = w(k)
+          wd1i = w(k + 1)
+          wd3r = w(k + 2)
+          wd3i = w(k + 3)
+          j1 = j + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j) + a(j2)
+          x0i = -a(j + 1) - a(j2 + 1)
+          x1r = a(j) - a(j2)
+          x1i = -a(j + 1) + a(j2 + 1)
+          y0r = a(j + 2) + a(j2 + 2)
+          y0i = -a(j + 3) - a(j2 + 3)
+          y1r = a(j + 2) - a(j2 + 2)
+          y1i = -a(j + 3) + a(j2 + 3)
+          x2r = a(j1) + a(j3)
+          x2i = a(j1 + 1) + a(j3 + 1)
+          x3r = a(j1) - a(j3)
+          x3i = a(j1 + 1) - a(j3 + 1)
+          y2r = a(j1 + 2) + a(j3 + 2)
+          y2i = a(j1 + 3) + a(j3 + 3)
+          y3r = a(j1 + 2) - a(j3 + 2)
+          y3i = a(j1 + 3) - a(j3 + 3)
+          a(j) = x0r + x2r
+          a(j + 1) = x0i - x2i
+          a(j + 2) = y0r + y2r
+          a(j + 3) = y0i - y2i
+          a(j1) = x0r - x2r
+          a(j1 + 1) = x0i + x2i
+          a(j1 + 2) = y0r - y2r
+          a(j1 + 3) = y0i + y2i
+          x0r = x1r + x3i
+          x0i = x1i + x3r
+          a(j2) = wk1r * x0r - wk1i * x0i
+          a(j2 + 1) = wk1r * x0i + wk1i * x0r
+          x0r = y1r + y3i
+          x0i = y1i + y3r
+          a(j2 + 2) = wd1r * x0r - wd1i * x0i
+          a(j2 + 3) = wd1r * x0i + wd1i * x0r
+          x0r = x1r - x3i
+          x0i = x1i - x3r
+          a(j3) = wk3r * x0r + wk3i * x0i
+          a(j3 + 1) = wk3r * x0i - wk3i * x0r
+          x0r = y1r - y3i
+          x0i = y1i - y3r
+          a(j3 + 2) = wd3r * x0r + wd3i * x0i
+          a(j3 + 3) = wd3r * x0i - wd3i * x0r
+          j0 = m - j
+          j1 = j0 + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j0) + a(j2)
+          x0i = -a(j0 + 1) - a(j2 + 1)
+          x1r = a(j0) - a(j2)
+          x1i = -a(j0 + 1) + a(j2 + 1)
+          y0r = a(j0 - 2) + a(j2 - 2)
+          y0i = -a(j0 - 1) - a(j2 - 1)
+          y1r = a(j0 - 2) - a(j2 - 2)
+          y1i = -a(j0 - 1) + a(j2 - 1)
+          x2r = a(j1) + a(j3)
+          x2i = a(j1 + 1) + a(j3 + 1)
+          x3r = a(j1) - a(j3)
+          x3i = a(j1 + 1) - a(j3 + 1)
+          y2r = a(j1 - 2) + a(j3 - 2)
+          y2i = a(j1 - 1) + a(j3 - 1)
+          y3r = a(j1 - 2) - a(j3 - 2)
+          y3i = a(j1 - 1) - a(j3 - 1)
+          a(j0) = x0r + x2r
+          a(j0 + 1) = x0i - x2i
+          a(j0 - 2) = y0r + y2r
+          a(j0 - 1) = y0i - y2i
+          a(j1) = x0r - x2r
+          a(j1 + 1) = x0i + x2i
+          a(j1 - 2) = y0r - y2r
+          a(j1 - 1) = y0i + y2i
+          x0r = x1r + x3i
+          x0i = x1i + x3r
+          a(j2) = wk1i * x0r - wk1r * x0i
+          a(j2 + 1) = wk1i * x0i + wk1r * x0r
+          x0r = y1r + y3i
+          x0i = y1i + y3r
+          a(j2 - 2) = wd1i * x0r - wd1r * x0i
+          a(j2 - 1) = wd1i * x0i + wd1r * x0r
+          x0r = x1r - x3i
+          x0i = x1i - x3r
+          a(j3) = wk3i * x0r + wk3r * x0i
+          a(j3 + 1) = wk3i * x0i - wk3r * x0r
+          x0r = y1r - y3i
+          x0i = y1i - y3r
+          a(j3 - 2) = wd3i * x0r + wd3r * x0i
+          a(j3 - 1) = wd3i * x0i - wd3r * x0r
+      end do
+      wk1r = csc1 * (wd1r + wn4r)
+      wk1i = csc1 * (wd1i + wn4r)
+      wk3r = csc3 * (wd3r - wn4r)
+      wk3i = csc3 * (wd3i - wn4r)
+      j0 = mh
+      j1 = j0 + m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(j0 - 2) + a(j2 - 2)
+      x0i = -a(j0 - 1) - a(j2 - 1)
+      x1r = a(j0 - 2) - a(j2 - 2)
+      x1i = -a(j0 - 1) + a(j2 - 1)
+      x2r = a(j1 - 2) + a(j3 - 2)
+      x2i = a(j1 - 1) + a(j3 - 1)
+      x3r = a(j1 - 2) - a(j3 - 2)
+      x3i = a(j1 - 1) - a(j3 - 1)
+      a(j0 - 2) = x0r + x2r
+      a(j0 - 1) = x0i - x2i
+      a(j1 - 2) = x0r - x2r
+      a(j1 - 1) = x0i + x2i
+      x0r = x1r + x3i
+      x0i = x1i + x3r
+      a(j2 - 2) = wk1r * x0r - wk1i * x0i
+      a(j2 - 1) = wk1r * x0i + wk1i * x0r
+      x0r = x1r - x3i
+      x0i = x1i - x3r
+      a(j3 - 2) = wk3r * x0r + wk3i * x0i
+      a(j3 - 1) = wk3r * x0i - wk3i * x0r
+      x0r = a(j0) + a(j2)
+      x0i = -a(j0 + 1) - a(j2 + 1)
+      x1r = a(j0) - a(j2)
+      x1i = -a(j0 + 1) + a(j2 + 1)
+      x2r = a(j1) + a(j3)
+      x2i = a(j1 + 1) + a(j3 + 1)
+      x3r = a(j1) - a(j3)
+      x3i = a(j1 + 1) - a(j3 + 1)
+      a(j0) = x0r + x2r
+      a(j0 + 1) = x0i - x2i
+      a(j1) = x0r - x2r
+      a(j1 + 1) = x0i + x2i
+      x0r = x1r + x3i
+      x0i = x1i + x3r
+      a(j2) = wn4r * (x0r - x0i)
+      a(j2 + 1) = wn4r * (x0i + x0r)
+      x0r = x1r - x3i
+      x0i = x1i - x3r
+      a(j3) = -wn4r * (x0r + x0i)
+      a(j3 + 1) = -wn4r * (x0i - x0r)
+      x0r = a(j0 + 2) + a(j2 + 2)
+      x0i = -a(j0 + 3) - a(j2 + 3)
+      x1r = a(j0 + 2) - a(j2 + 2)
+      x1i = -a(j0 + 3) + a(j2 + 3)
+      x2r = a(j1 + 2) + a(j3 + 2)
+      x2i = a(j1 + 3) + a(j3 + 3)
+      x3r = a(j1 + 2) - a(j3 + 2)
+      x3i = a(j1 + 3) - a(j3 + 3)
+      a(j0 + 2) = x0r + x2r
+      a(j0 + 3) = x0i - x2i
+      a(j1 + 2) = x0r - x2r
+      a(j1 + 3) = x0i + x2i
+      x0r = x1r + x3i
+      x0i = x1i + x3r
+      a(j2 + 2) = wk1i * x0r - wk1r * x0i
+      a(j2 + 3) = wk1i * x0i + wk1r * x0r
+      x0r = x1r - x3i
+      x0i = x1i - x3r
+      a(j3 + 2) = wk3i * x0r + wk3r * x0i
+      a(j3 + 3) = wk3i * x0i - wk3r * x0r
+      end
+!
+      subroutine cftrec4(n, a, nw, w)
+      integer n, nw, cfttree, isplt, j, k, m
+      real*8 a(0 : n - 1), w(0 : nw - 1)
+      m = n
+      do while (m .gt. 512)
+          m = m / 4
+          call cftmdl1(m, a(n - m), w(nw - m / 2))
+      end do
+      call cftleaf(m, 1, a(n - m), nw, w)
+      k = 0
+      do j = n - m, m, -m
+          k = k + 1
+          isplt = cfttree(m, j, k, a, nw, w)
+          call cftleaf(m, isplt, a(j - m), nw, w)
+      end do
+      end
+!
+      integer function cfttree(n, j, k, a, nw, w)
+      integer n, j, k, nw, i, isplt, m
+      real*8 a(0 : j - 1), w(0 : nw - 1)
+      if (mod(k, 4) .ne. 0) then
+          isplt = mod(k, 2)
+          if (isplt .ne. 0) then
+              call cftmdl1(n, a(j - n), w(nw - n / 2))
+          else
+              call cftmdl2(n, a(j - n), w(nw - n))
+          end if
+      else
+          m = n
+          i = k
+          do while (mod(i, 4) .eq. 0)
+              m = m * 4
+              i = i / 4
+          end do
+          isplt = mod(i, 2)
+          if (isplt .ne. 0) then
+              do while (m .gt. 128)
+                  call cftmdl1(m, a(j - m), w(nw - m / 2))
+                  m = m / 4
+              end do
+          else
+              do while (m .gt. 128)
+                  call cftmdl2(m, a(j - m), w(nw - m))
+                  m = m / 4
+              end do
+          end if
+      end if
+      cfttree = isplt
+      end
+!
+      subroutine cftleaf(n, isplt, a, nw, w)
+      integer n, isplt, nw
+      real*8 a(0 : n - 1), w(0 : nw - 1)
+      if (n .eq. 512) then
+          call cftmdl1(128, a, w(nw - 64))
+          call cftf161(a, w(nw - 8))
+          call cftf162(a(32), w(nw - 32))
+          call cftf161(a(64), w(nw - 8))
+          call cftf161(a(96), w(nw - 8))
+          call cftmdl2(128, a(128), w(nw - 128))
+          call cftf161(a(128), w(nw - 8))
+          call cftf162(a(160), w(nw - 32))
+          call cftf161(a(192), w(nw - 8))
+          call cftf162(a(224), w(nw - 32))
+          call cftmdl1(128, a(256), w(nw - 64))
+          call cftf161(a(256), w(nw - 8))
+          call cftf162(a(288), w(nw - 32))
+          call cftf161(a(320), w(nw - 8))
+          call cftf161(a(352), w(nw - 8))
+          if (isplt .ne. 0) then
+              call cftmdl1(128, a(384), w(nw - 64))
+              call cftf161(a(480), w(nw - 8))
+          else
+              call cftmdl2(128, a(384), w(nw - 128))
+              call cftf162(a(480), w(nw - 32))
+          end if
+          call cftf161(a(384), w(nw - 8))
+          call cftf162(a(416), w(nw - 32))
+          call cftf161(a(448), w(nw - 8))
+      else
+          call cftmdl1(64, a, w(nw - 32))
+          call cftf081(a, w(nw - 8))
+          call cftf082(a(16), w(nw - 8))
+          call cftf081(a(32), w(nw - 8))
+          call cftf081(a(48), w(nw - 8))
+          call cftmdl2(64, a(64), w(nw - 64))
+          call cftf081(a(64), w(nw - 8))
+          call cftf082(a(80), w(nw - 8))
+          call cftf081(a(96), w(nw - 8))
+          call cftf082(a(112), w(nw - 8))
+          call cftmdl1(64, a(128), w(nw - 32))
+          call cftf081(a(128), w(nw - 8))
+          call cftf082(a(144), w(nw - 8))
+          call cftf081(a(160), w(nw - 8))
+          call cftf081(a(176), w(nw - 8))
+          if (isplt .ne. 0) then
+              call cftmdl1(64, a(192), w(nw - 32))
+              call cftf081(a(240), w(nw - 8))
+          else
+              call cftmdl2(64, a(192), w(nw - 64))
+              call cftf082(a(240), w(nw - 8))
+          end if
+          call cftf081(a(192), w(nw - 8))
+          call cftf082(a(208), w(nw - 8))
+          call cftf081(a(224), w(nw - 8))
+      end if
+      end
+!
+      subroutine cftmdl1(n, a, w)
+      integer n, j, j0, j1, j2, j3, k, m, mh
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wn4r, wk1r, wk1i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      mh = n / 8
+      m = 2 * mh
+      j1 = m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(0) + a(j2)
+      x0i = a(1) + a(j2 + 1)
+      x1r = a(0) - a(j2)
+      x1i = a(1) - a(j2 + 1)
+      x2r = a(j1) + a(j3)
+      x2i = a(j1 + 1) + a(j3 + 1)
+      x3r = a(j1) - a(j3)
+      x3i = a(j1 + 1) - a(j3 + 1)
+      a(0) = x0r + x2r
+      a(1) = x0i + x2i
+      a(j1) = x0r - x2r
+      a(j1 + 1) = x0i - x2i
+      a(j2) = x1r - x3i
+      a(j2 + 1) = x1i + x3r
+      a(j3) = x1r + x3i
+      a(j3 + 1) = x1i - x3r
+      wn4r = w(1)
+      k = 0
+      do j = 2, mh - 2, 2
+          k = k + 4
+          wk1r = w(k)
+          wk1i = w(k + 1)
+          wk3r = w(k + 2)
+          wk3i = w(k + 3)
+          j1 = j + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j) + a(j2)
+          x0i = a(j + 1) + a(j2 + 1)
+          x1r = a(j) - a(j2)
+          x1i = a(j + 1) - a(j2 + 1)
+          x2r = a(j1) + a(j3)
+          x2i = a(j1 + 1) + a(j3 + 1)
+          x3r = a(j1) - a(j3)
+          x3i = a(j1 + 1) - a(j3 + 1)
+          a(j) = x0r + x2r
+          a(j + 1) = x0i + x2i
+          a(j1) = x0r - x2r
+          a(j1 + 1) = x0i - x2i
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          a(j2) = wk1r * x0r - wk1i * x0i
+          a(j2 + 1) = wk1r * x0i + wk1i * x0r
+          x0r = x1r + x3i
+          x0i = x1i - x3r
+          a(j3) = wk3r * x0r + wk3i * x0i
+          a(j3 + 1) = wk3r * x0i - wk3i * x0r
+          j0 = m - j
+          j1 = j0 + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j0) + a(j2)
+          x0i = a(j0 + 1) + a(j2 + 1)
+          x1r = a(j0) - a(j2)
+          x1i = a(j0 + 1) - a(j2 + 1)
+          x2r = a(j1) + a(j3)
+          x2i = a(j1 + 1) + a(j3 + 1)
+          x3r = a(j1) - a(j3)
+          x3i = a(j1 + 1) - a(j3 + 1)
+          a(j0) = x0r + x2r
+          a(j0 + 1) = x0i + x2i
+          a(j1) = x0r - x2r
+          a(j1 + 1) = x0i - x2i
+          x0r = x1r - x3i
+          x0i = x1i + x3r
+          a(j2) = wk1i * x0r - wk1r * x0i
+          a(j2 + 1) = wk1i * x0i + wk1r * x0r
+          x0r = x1r + x3i
+          x0i = x1i - x3r
+          a(j3) = wk3i * x0r + wk3r * x0i
+          a(j3 + 1) = wk3i * x0i - wk3r * x0r
+      end do
+      j0 = mh
+      j1 = j0 + m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(j0) + a(j2)
+      x0i = a(j0 + 1) + a(j2 + 1)
+      x1r = a(j0) - a(j2)
+      x1i = a(j0 + 1) - a(j2 + 1)
+      x2r = a(j1) + a(j3)
+      x2i = a(j1 + 1) + a(j3 + 1)
+      x3r = a(j1) - a(j3)
+      x3i = a(j1 + 1) - a(j3 + 1)
+      a(j0) = x0r + x2r
+      a(j0 + 1) = x0i + x2i
+      a(j1) = x0r - x2r
+      a(j1 + 1) = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      a(j2) = wn4r * (x0r - x0i)
+      a(j2 + 1) = wn4r * (x0i + x0r)
+      x0r = x1r + x3i
+      x0i = x1i - x3r
+      a(j3) = -wn4r * (x0r + x0i)
+      a(j3 + 1) = -wn4r * (x0i - x0r)
+      end
+!
+      subroutine cftmdl2(n, a, w)
+      integer n, j, j0, j1, j2, j3, k, kr, m, mh
+      real*8 a(0 : n - 1), w(0 : *)
+      real*8 wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y2r, y2i
+      mh = n / 8
+      m = 2 * mh
+      wn4r = w(1)
+      j1 = m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(0) - a(j2 + 1)
+      x0i = a(1) + a(j2)
+      x1r = a(0) + a(j2 + 1)
+      x1i = a(1) - a(j2)
+      x2r = a(j1) - a(j3 + 1)
+      x2i = a(j1 + 1) + a(j3)
+      x3r = a(j1) + a(j3 + 1)
+      x3i = a(j1 + 1) - a(j3)
+      y0r = wn4r * (x2r - x2i)
+      y0i = wn4r * (x2i + x2r)
+      a(0) = x0r + y0r
+      a(1) = x0i + y0i
+      a(j1) = x0r - y0r
+      a(j1 + 1) = x0i - y0i
+      y0r = wn4r * (x3r - x3i)
+      y0i = wn4r * (x3i + x3r)
+      a(j2) = x1r - y0i
+      a(j2 + 1) = x1i + y0r
+      a(j3) = x1r + y0i
+      a(j3 + 1) = x1i - y0r
+      k = 0
+      kr = 2 * m
+      do j = 2, mh - 2, 2
+          k = k + 4
+          wk1r = w(k)
+          wk1i = w(k + 1)
+          wk3r = w(k + 2)
+          wk3i = w(k + 3)
+          kr = kr - 4
+          wd1i = w(kr)
+          wd1r = w(kr + 1)
+          wd3i = w(kr + 2)
+          wd3r = w(kr + 3)
+          j1 = j + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j) - a(j2 + 1)
+          x0i = a(j + 1) + a(j2)
+          x1r = a(j) + a(j2 + 1)
+          x1i = a(j + 1) - a(j2)
+          x2r = a(j1) - a(j3 + 1)
+          x2i = a(j1 + 1) + a(j3)
+          x3r = a(j1) + a(j3 + 1)
+          x3i = a(j1 + 1) - a(j3)
+          y0r = wk1r * x0r - wk1i * x0i
+          y0i = wk1r * x0i + wk1i * x0r
+          y2r = wd1r * x2r - wd1i * x2i
+          y2i = wd1r * x2i + wd1i * x2r
+          a(j) = y0r + y2r
+          a(j + 1) = y0i + y2i
+          a(j1) = y0r - y2r
+          a(j1 + 1) = y0i - y2i
+          y0r = wk3r * x1r + wk3i * x1i
+          y0i = wk3r * x1i - wk3i * x1r
+          y2r = wd3r * x3r + wd3i * x3i
+          y2i = wd3r * x3i - wd3i * x3r
+          a(j2) = y0r + y2r
+          a(j2 + 1) = y0i + y2i
+          a(j3) = y0r - y2r
+          a(j3 + 1) = y0i - y2i
+          j0 = m - j
+          j1 = j0 + m
+          j2 = j1 + m
+          j3 = j2 + m
+          x0r = a(j0) - a(j2 + 1)
+          x0i = a(j0 + 1) + a(j2)
+          x1r = a(j0) + a(j2 + 1)
+          x1i = a(j0 + 1) - a(j2)
+          x2r = a(j1) - a(j3 + 1)
+          x2i = a(j1 + 1) + a(j3)
+          x3r = a(j1) + a(j3 + 1)
+          x3i = a(j1 + 1) - a(j3)
+          y0r = wd1i * x0r - wd1r * x0i
+          y0i = wd1i * x0i + wd1r * x0r
+          y2r = wk1i * x2r - wk1r * x2i
+          y2i = wk1i * x2i + wk1r * x2r
+          a(j0) = y0r + y2r
+          a(j0 + 1) = y0i + y2i
+          a(j1) = y0r - y2r
+          a(j1 + 1) = y0i - y2i
+          y0r = wd3i * x1r + wd3r * x1i
+          y0i = wd3i * x1i - wd3r * x1r
+          y2r = wk3i * x3r + wk3r * x3i
+          y2i = wk3i * x3i - wk3r * x3r
+          a(j2) = y0r + y2r
+          a(j2 + 1) = y0i + y2i
+          a(j3) = y0r - y2r
+          a(j3 + 1) = y0i - y2i
+      end do
+      wk1r = w(m)
+      wk1i = w(m + 1)
+      j0 = mh
+      j1 = j0 + m
+      j2 = j1 + m
+      j3 = j2 + m
+      x0r = a(j0) - a(j2 + 1)
+      x0i = a(j0 + 1) + a(j2)
+      x1r = a(j0) + a(j2 + 1)
+      x1i = a(j0 + 1) - a(j2)
+      x2r = a(j1) - a(j3 + 1)
+      x2i = a(j1 + 1) + a(j3)
+      x3r = a(j1) + a(j3 + 1)
+      x3i = a(j1 + 1) - a(j3)
+      y0r = wk1r * x0r - wk1i * x0i
+      y0i = wk1r * x0i + wk1i * x0r
+      y2r = wk1i * x2r - wk1r * x2i
+      y2i = wk1i * x2i + wk1r * x2r
+      a(j0) = y0r + y2r
+      a(j0 + 1) = y0i + y2i
+      a(j1) = y0r - y2r
+      a(j1 + 1) = y0i - y2i
+      y0r = wk1i * x1r - wk1r * x1i
+      y0i = wk1i * x1i + wk1r * x1r
+      y2r = wk1r * x3r - wk1i * x3i
+      y2i = wk1r * x3i + wk1i * x3r
+      a(j2) = y0r - y2r
+      a(j2 + 1) = y0i - y2i
+      a(j3) = y0r + y2r
+      a(j3 + 1) = y0i + y2i
+      end
+!
+      subroutine cftfx41(n, a, nw, w)
+      integer n, nw
+      real*8 a(0 : n - 1), w(0 : nw - 1)
+      if (n .eq. 128) then
+          call cftf161(a, w(nw - 8))
+          call cftf162(a(32), w(nw - 32))
+          call cftf161(a(64), w(nw - 8))
+          call cftf161(a(96), w(nw - 8))
+      else
+          call cftf081(a, w(nw - 8))
+          call cftf082(a(16), w(nw - 8))
+          call cftf081(a(32), w(nw - 8))
+          call cftf081(a(48), w(nw - 8))
+      end if
+      end
+!
+      subroutine cftf161(a, w)
+      real*8 a(0 : 31), w(0 : *), wn4r, wk1r, wk1i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+      real*8 y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i
+      real*8 y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i
+      wn4r = w(1)
+      wk1r = w(2)
+      wk1i = w(3)
+      x0r = a(0) + a(16)
+      x0i = a(1) + a(17)
+      x1r = a(0) - a(16)
+      x1i = a(1) - a(17)
+      x2r = a(8) + a(24)
+      x2i = a(9) + a(25)
+      x3r = a(8) - a(24)
+      x3i = a(9) - a(25)
+      y0r = x0r + x2r
+      y0i = x0i + x2i
+      y4r = x0r - x2r
+      y4i = x0i - x2i
+      y8r = x1r - x3i
+      y8i = x1i + x3r
+      y12r = x1r + x3i
+      y12i = x1i - x3r
+      x0r = a(2) + a(18)
+      x0i = a(3) + a(19)
+      x1r = a(2) - a(18)
+      x1i = a(3) - a(19)
+      x2r = a(10) + a(26)
+      x2i = a(11) + a(27)
+      x3r = a(10) - a(26)
+      x3i = a(11) - a(27)
+      y1r = x0r + x2r
+      y1i = x0i + x2i
+      y5r = x0r - x2r
+      y5i = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      y9r = wk1r * x0r - wk1i * x0i
+      y9i = wk1r * x0i + wk1i * x0r
+      x0r = x1r + x3i
+      x0i = x1i - x3r
+      y13r = wk1i * x0r - wk1r * x0i
+      y13i = wk1i * x0i + wk1r * x0r
+      x0r = a(4) + a(20)
+      x0i = a(5) + a(21)
+      x1r = a(4) - a(20)
+      x1i = a(5) - a(21)
+      x2r = a(12) + a(28)
+      x2i = a(13) + a(29)
+      x3r = a(12) - a(28)
+      x3i = a(13) - a(29)
+      y2r = x0r + x2r
+      y2i = x0i + x2i
+      y6r = x0r - x2r
+      y6i = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      y10r = wn4r * (x0r - x0i)
+      y10i = wn4r * (x0i + x0r)
+      x0r = x1r + x3i
+      x0i = x1i - x3r
+      y14r = wn4r * (x0r + x0i)
+      y14i = wn4r * (x0i - x0r)
+      x0r = a(6) + a(22)
+      x0i = a(7) + a(23)
+      x1r = a(6) - a(22)
+      x1i = a(7) - a(23)
+      x2r = a(14) + a(30)
+      x2i = a(15) + a(31)
+      x3r = a(14) - a(30)
+      x3i = a(15) - a(31)
+      y3r = x0r + x2r
+      y3i = x0i + x2i
+      y7r = x0r - x2r
+      y7i = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      y11r = wk1i * x0r - wk1r * x0i
+      y11i = wk1i * x0i + wk1r * x0r
+      x0r = x1r + x3i
+      x0i = x1i - x3r
+      y15r = wk1r * x0r - wk1i * x0i
+      y15i = wk1r * x0i + wk1i * x0r
+      x0r = y12r - y14r
+      x0i = y12i - y14i
+      x1r = y12r + y14r
+      x1i = y12i + y14i
+      x2r = y13r - y15r
+      x2i = y13i - y15i
+      x3r = y13r + y15r
+      x3i = y13i + y15i
+      a(24) = x0r + x2r
+      a(25) = x0i + x2i
+      a(26) = x0r - x2r
+      a(27) = x0i - x2i
+      a(28) = x1r - x3i
+      a(29) = x1i + x3r
+      a(30) = x1r + x3i
+      a(31) = x1i - x3r
+      x0r = y8r + y10r
+      x0i = y8i + y10i
+      x1r = y8r - y10r
+      x1i = y8i - y10i
+      x2r = y9r + y11r
+      x2i = y9i + y11i
+      x3r = y9r - y11r
+      x3i = y9i - y11i
+      a(16) = x0r + x2r
+      a(17) = x0i + x2i
+      a(18) = x0r - x2r
+      a(19) = x0i - x2i
+      a(20) = x1r - x3i
+      a(21) = x1i + x3r
+      a(22) = x1r + x3i
+      a(23) = x1i - x3r
+      x0r = y5r - y7i
+      x0i = y5i + y7r
+      x2r = wn4r * (x0r - x0i)
+      x2i = wn4r * (x0i + x0r)
+      x0r = y5r + y7i
+      x0i = y5i - y7r
+      x3r = wn4r * (x0r - x0i)
+      x3i = wn4r * (x0i + x0r)
+      x0r = y4r - y6i
+      x0i = y4i + y6r
+      x1r = y4r + y6i
+      x1i = y4i - y6r
+      a(8) = x0r + x2r
+      a(9) = x0i + x2i
+      a(10) = x0r - x2r
+      a(11) = x0i - x2i
+      a(12) = x1r - x3i
+      a(13) = x1i + x3r
+      a(14) = x1r + x3i
+      a(15) = x1i - x3r
+      x0r = y0r + y2r
+      x0i = y0i + y2i
+      x1r = y0r - y2r
+      x1i = y0i - y2i
+      x2r = y1r + y3r
+      x2i = y1i + y3i
+      x3r = y1r - y3r
+      x3i = y1i - y3i
+      a(0) = x0r + x2r
+      a(1) = x0i + x2i
+      a(2) = x0r - x2r
+      a(3) = x0i - x2i
+      a(4) = x1r - x3i
+      a(5) = x1i + x3r
+      a(6) = x1r + x3i
+      a(7) = x1i - x3r
+      end
+!
+      subroutine cftf162(a, w)
+      real*8 a(0 : 31), w(0 : *)
+      real*8 wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+      real*8 x0r, x0i, x1r, x1i, x2r, x2i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+      real*8 y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i
+      real*8 y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i
+      wn4r = w(1)
+      wk1r = w(4)
+      wk1i = w(5)
+      wk3r = w(6)
+      wk3i = -w(7)
+      wk2r = w(8)
+      wk2i = w(9)
+      x1r = a(0) - a(17)
+      x1i = a(1) + a(16)
+      x0r = a(8) - a(25)
+      x0i = a(9) + a(24)
+      x2r = wn4r * (x0r - x0i)
+      x2i = wn4r * (x0i + x0r)
+      y0r = x1r + x2r
+      y0i = x1i + x2i
+      y4r = x1r - x2r
+      y4i = x1i - x2i
+      x1r = a(0) + a(17)
+      x1i = a(1) - a(16)
+      x0r = a(8) + a(25)
+      x0i = a(9) - a(24)
+      x2r = wn4r * (x0r - x0i)
+      x2i = wn4r * (x0i + x0r)
+      y8r = x1r - x2i
+      y8i = x1i + x2r
+      y12r = x1r + x2i
+      y12i = x1i - x2r
+      x0r = a(2) - a(19)
+      x0i = a(3) + a(18)
+      x1r = wk1r * x0r - wk1i * x0i
+      x1i = wk1r * x0i + wk1i * x0r
+      x0r = a(10) - a(27)
+      x0i = a(11) + a(26)
+      x2r = wk3i * x0r - wk3r * x0i
+      x2i = wk3i * x0i + wk3r * x0r
+      y1r = x1r + x2r
+      y1i = x1i + x2i
+      y5r = x1r - x2r
+      y5i = x1i - x2i
+      x0r = a(2) + a(19)
+      x0i = a(3) - a(18)
+      x1r = wk3r * x0r - wk3i * x0i
+      x1i = wk3r * x0i + wk3i * x0r
+      x0r = a(10) + a(27)
+      x0i = a(11) - a(26)
+      x2r = wk1r * x0r + wk1i * x0i
+      x2i = wk1r * x0i - wk1i * x0r
+      y9r = x1r - x2r
+      y9i = x1i - x2i
+      y13r = x1r + x2r
+      y13i = x1i + x2i
+      x0r = a(4) - a(21)
+      x0i = a(5) + a(20)
+      x1r = wk2r * x0r - wk2i * x0i
+      x1i = wk2r * x0i + wk2i * x0r
+      x0r = a(12) - a(29)
+      x0i = a(13) + a(28)
+      x2r = wk2i * x0r - wk2r * x0i
+      x2i = wk2i * x0i + wk2r * x0r
+      y2r = x1r + x2r
+      y2i = x1i + x2i
+      y6r = x1r - x2r
+      y6i = x1i - x2i
+      x0r = a(4) + a(21)
+      x0i = a(5) - a(20)
+      x1r = wk2i * x0r - wk2r * x0i
+      x1i = wk2i * x0i + wk2r * x0r
+      x0r = a(12) + a(29)
+      x0i = a(13) - a(28)
+      x2r = wk2r * x0r - wk2i * x0i
+      x2i = wk2r * x0i + wk2i * x0r
+      y10r = x1r - x2r
+      y10i = x1i - x2i
+      y14r = x1r + x2r
+      y14i = x1i + x2i
+      x0r = a(6) - a(23)
+      x0i = a(7) + a(22)
+      x1r = wk3r * x0r - wk3i * x0i
+      x1i = wk3r * x0i + wk3i * x0r
+      x0r = a(14) - a(31)
+      x0i = a(15) + a(30)
+      x2r = wk1i * x0r - wk1r * x0i
+      x2i = wk1i * x0i + wk1r * x0r
+      y3r = x1r + x2r
+      y3i = x1i + x2i
+      y7r = x1r - x2r
+      y7i = x1i - x2i
+      x0r = a(6) + a(23)
+      x0i = a(7) - a(22)
+      x1r = wk1i * x0r + wk1r * x0i
+      x1i = wk1i * x0i - wk1r * x0r
+      x0r = a(14) + a(31)
+      x0i = a(15) - a(30)
+      x2r = wk3i * x0r - wk3r * x0i
+      x2i = wk3i * x0i + wk3r * x0r
+      y11r = x1r + x2r
+      y11i = x1i + x2i
+      y15r = x1r - x2r
+      y15i = x1i - x2i
+      x1r = y0r + y2r
+      x1i = y0i + y2i
+      x2r = y1r + y3r
+      x2i = y1i + y3i
+      a(0) = x1r + x2r
+      a(1) = x1i + x2i
+      a(2) = x1r - x2r
+      a(3) = x1i - x2i
+      x1r = y0r - y2r
+      x1i = y0i - y2i
+      x2r = y1r - y3r
+      x2i = y1i - y3i
+      a(4) = x1r - x2i
+      a(5) = x1i + x2r
+      a(6) = x1r + x2i
+      a(7) = x1i - x2r
+      x1r = y4r - y6i
+      x1i = y4i + y6r
+      x0r = y5r - y7i
+      x0i = y5i + y7r
+      x2r = wn4r * (x0r - x0i)
+      x2i = wn4r * (x0i + x0r)
+      a(8) = x1r + x2r
+      a(9) = x1i + x2i
+      a(10) = x1r - x2r
+      a(11) = x1i - x2i
+      x1r = y4r + y6i
+      x1i = y4i - y6r
+      x0r = y5r + y7i
+      x0i = y5i - y7r
+      x2r = wn4r * (x0r - x0i)
+      x2i = wn4r * (x0i + x0r)
+      a(12) = x1r - x2i
+      a(13) = x1i + x2r
+      a(14) = x1r + x2i
+      a(15) = x1i - x2r
+      x1r = y8r + y10r
+      x1i = y8i + y10i
+      x2r = y9r - y11r
+      x2i = y9i - y11i
+      a(16) = x1r + x2r
+      a(17) = x1i + x2i
+      a(18) = x1r - x2r
+      a(19) = x1i - x2i
+      x1r = y8r - y10r
+      x1i = y8i - y10i
+      x2r = y9r + y11r
+      x2i = y9i + y11i
+      a(20) = x1r - x2i
+      a(21) = x1i + x2r
+      a(22) = x1r + x2i
+      a(23) = x1i - x2r
+      x1r = y12r - y14i
+      x1i = y12i + y14r
+      x0r = y13r + y15i
+      x0i = y13i - y15r
+      x2r = wn4r * (x0r - x0i)
+      x2i = wn4r * (x0i + x0r)
+      a(24) = x1r + x2r
+      a(25) = x1i + x2i
+      a(26) = x1r - x2r
+      a(27) = x1i - x2i
+      x1r = y12r + y14i
+      x1i = y12i - y14r
+      x0r = y13r - y15i
+      x0i = y13i + y15r
+      x2r = wn4r * (x0r - x0i)
+      x2i = wn4r * (x0i + x0r)
+      a(28) = x1r - x2i
+      a(29) = x1i + x2r
+      a(30) = x1r + x2i
+      a(31) = x1i - x2r
+      end
+!
+      subroutine cftf081(a, w)
+      real*8 a(0 : 15), w(0 : *)
+      real*8 wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+      wn4r = w(1)
+      x0r = a(0) + a(8)
+      x0i = a(1) + a(9)
+      x1r = a(0) - a(8)
+      x1i = a(1) - a(9)
+      x2r = a(4) + a(12)
+      x2i = a(5) + a(13)
+      x3r = a(4) - a(12)
+      x3i = a(5) - a(13)
+      y0r = x0r + x2r
+      y0i = x0i + x2i
+      y2r = x0r - x2r
+      y2i = x0i - x2i
+      y1r = x1r - x3i
+      y1i = x1i + x3r
+      y3r = x1r + x3i
+      y3i = x1i - x3r
+      x0r = a(2) + a(10)
+      x0i = a(3) + a(11)
+      x1r = a(2) - a(10)
+      x1i = a(3) - a(11)
+      x2r = a(6) + a(14)
+      x2i = a(7) + a(15)
+      x3r = a(6) - a(14)
+      x3i = a(7) - a(15)
+      y4r = x0r + x2r
+      y4i = x0i + x2i
+      y6r = x0r - x2r
+      y6i = x0i - x2i
+      x0r = x1r - x3i
+      x0i = x1i + x3r
+      x2r = x1r + x3i
+      x2i = x1i - x3r
+      y5r = wn4r * (x0r - x0i)
+      y5i = wn4r * (x0r + x0i)
+      y7r = wn4r * (x2r - x2i)
+      y7i = wn4r * (x2r + x2i)
+      a(8) = y1r + y5r
+      a(9) = y1i + y5i
+      a(10) = y1r - y5r
+      a(11) = y1i - y5i
+      a(12) = y3r - y7i
+      a(13) = y3i + y7r
+      a(14) = y3r + y7i
+      a(15) = y3i - y7r
+      a(0) = y0r + y4r
+      a(1) = y0i + y4i
+      a(2) = y0r - y4r
+      a(3) = y0i - y4i
+      a(4) = y2r - y6i
+      a(5) = y2i + y6r
+      a(6) = y2r + y6i
+      a(7) = y2i - y6r
+      end
+!
+      subroutine cftf082(a, w)
+      real*8 a(0 : 15), w(0 : *)
+      real*8 wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i
+      real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+      real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+      wn4r = w(1)
+      wk1r = w(2)
+      wk1i = w(3)
+      y0r = a(0) - a(9)
+      y0i = a(1) + a(8)
+      y1r = a(0) + a(9)
+      y1i = a(1) - a(8)
+      x0r = a(4) - a(13)
+      x0i = a(5) + a(12)
+      y2r = wn4r * (x0r - x0i)
+      y2i = wn4r * (x0i + x0r)
+      x0r = a(4) + a(13)
+      x0i = a(5) - a(12)
+      y3r = wn4r * (x0r - x0i)
+      y3i = wn4r * (x0i + x0r)
+      x0r = a(2) - a(11)
+      x0i = a(3) + a(10)
+      y4r = wk1r * x0r - wk1i * x0i
+      y4i = wk1r * x0i + wk1i * x0r
+      x0r = a(2) + a(11)
+      x0i = a(3) - a(10)
+      y5r = wk1i * x0r - wk1r * x0i
+      y5i = wk1i * x0i + wk1r * x0r
+      x0r = a(6) - a(15)
+      x0i = a(7) + a(14)
+      y6r = wk1i * x0r - wk1r * x0i
+      y6i = wk1i * x0i + wk1r * x0r
+      x0r = a(6) + a(15)
+      x0i = a(7) - a(14)
+      y7r = wk1r * x0r - wk1i * x0i
+      y7i = wk1r * x0i + wk1i * x0r
+      x0r = y0r + y2r
+      x0i = y0i + y2i
+      x1r = y4r + y6r
+      x1i = y4i + y6i
+      a(0) = x0r + x1r
+      a(1) = x0i + x1i
+      a(2) = x0r - x1r
+      a(3) = x0i - x1i
+      x0r = y0r - y2r
+      x0i = y0i - y2i
+      x1r = y4r - y6r
+      x1i = y4i - y6i
+      a(4) = x0r - x1i
+      a(5) = x0i + x1r
+      a(6) = x0r + x1i
+      a(7) = x0i - x1r
+      x0r = y1r - y3i
+      x0i = y1i + y3r
+      x1r = y5r - y7r
+      x1i = y5i - y7i
+      a(8) = x0r + x1r
+      a(9) = x0i + x1i
+      a(10) = x0r - x1r
+      a(11) = x0i - x1i
+      x0r = y1r + y3i
+      x0i = y1i - y3r
+      x1r = y5r + y7r
+      x1i = y5i + y7i
+      a(12) = x0r - x1i
+      a(13) = x0i + x1r
+      a(14) = x0r + x1i
+      a(15) = x0i - x1r
+      end
+!
+      subroutine cftf040(a)
+      real*8 a(0 : 7), x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      x0r = a(0) + a(4)
+      x0i = a(1) + a(5)
+      x1r = a(0) - a(4)
+      x1i = a(1) - a(5)
+      x2r = a(2) + a(6)
+      x2i = a(3) + a(7)
+      x3r = a(2) - a(6)
+      x3i = a(3) - a(7)
+      a(0) = x0r + x2r
+      a(1) = x0i + x2i
+      a(2) = x1r - x3i
+      a(3) = x1i + x3r
+      a(4) = x0r - x2r
+      a(5) = x0i - x2i
+      a(6) = x1r + x3i
+      a(7) = x1i - x3r
+      end
+!
+      subroutine cftb040(a)
+      real*8 a(0 : 7), x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+      x0r = a(0) + a(4)
+      x0i = a(1) + a(5)
+      x1r = a(0) - a(4)
+      x1i = a(1) - a(5)
+      x2r = a(2) + a(6)
+      x2i = a(3) + a(7)
+      x3r = a(2) - a(6)
+      x3i = a(3) - a(7)
+      a(0) = x0r + x2r
+      a(1) = x0i + x2i
+      a(2) = x1r + x3i
+      a(3) = x1i - x3r
+      a(4) = x0r - x2r
+      a(5) = x0i - x2i
+      a(6) = x1r - x3i
+      a(7) = x1i + x3r
+      end
+!
+      subroutine cftx020(a)
+      real*8 a(0 : 3), x0r, x0i
+      x0r = a(0) - a(2)
+      x0i = a(1) - a(3)
+      a(0) = a(0) + a(2)
+      a(1) = a(1) + a(3)
+      a(2) = x0r
+      a(3) = x0i
+      end
+!
+      subroutine rftfsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+      m = n / 2
+      ks = 2 * nc / m
+      kk = 0
+      do j = 2, m - 2, 2
+          k = n - j
+          kk = kk + ks
+          wkr = 0.5d0 - c(nc - kk)
+          wki = c(kk)
+          xr = a(j) - a(k)
+          xi = a(j + 1) + a(k + 1)
+          yr = wkr * xr - wki * xi
+          yi = wkr * xi + wki * xr
+          a(j) = a(j) - yr
+          a(j + 1) = a(j + 1) - yi
+          a(k) = a(k) + yr
+          a(k + 1) = a(k + 1) - yi
+      end do
+      end
+!
+      subroutine rftbsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+      m = n / 2
+      ks = 2 * nc / m
+      kk = 0
+      do j = 2, m - 2, 2
+          k = n - j
+          kk = kk + ks
+          wkr = 0.5d0 - c(nc - kk)
+          wki = c(kk)
+          xr = a(j) - a(k)
+          xi = a(j + 1) + a(k + 1)
+          yr = wkr * xr + wki * xi
+          yi = wkr * xi - wki * xr
+          a(j) = a(j) - yr
+          a(j + 1) = a(j + 1) - yi
+          a(k) = a(k) + yr
+          a(k + 1) = a(k + 1) - yi
+      end do
+      end
+!
+      subroutine dctsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+      m = n / 2
+      ks = nc / n
+      kk = 0
+      do j = 1, m - 1
+          k = n - j
+          kk = kk + ks
+          wkr = c(kk) - c(nc - kk)
+          wki = c(kk) + c(nc - kk)
+          xr = wki * a(j) - wkr * a(k)
+          a(j) = wkr * a(j) + wki * a(k)
+          a(k) = xr
+      end do
+      a(m) = c(0) * a(m)
+      end
+!
+      subroutine dstsub(n, a, nc, c)
+      integer n, nc, j, k, kk, ks, m
+      real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+      m = n / 2
+      ks = nc / n
+      kk = 0
+      do j = 1, m - 1
+          k = n - j
+          kk = kk + ks
+          wkr = c(kk) - c(nc - kk)
+          wki = c(kk) + c(nc - kk)
+          xr = wki * a(k) - wkr * a(j)
+          a(k) = wkr * a(k) + wki * a(j)
+          a(j) = xr
+      end do
+      a(m) = c(0) * a(m)
+      end
+!
diff --git a/fftsg2d.c b/fftsg2d.c
new file mode 100644
index 0000000..2ebfa93
--- /dev/null
+++ b/fftsg2d.c
@@ -0,0 +1,1190 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :two
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix, row-column
+    data        :inplace
+    table       :use
+functions
+    cdft2d: Complex Discrete Fourier Transform
+    rdft2d: Real Discrete Fourier Transform
+    ddct2d: Discrete Cosine Transform
+    ddst2d: Discrete Sine Transform
+function prototypes
+    void cdft2d(int, int, int, double **, double *, int *, double *);
+    void rdft2d(int, int, int, double **, double *, int *, double *);
+    void rdft2dsort(int, int, int, double **);
+    void ddct2d(int, int, int, double **, double *, int *, double *);
+    void ddst2d(int, int, int, double **, double *, int *, double *);
+necessary package
+    fftsg.c  : 1D-FFT package
+macro definitions
+    USE_FFT2D_PTHREADS : default=not defined
+        FFT2D_MAX_THREADS     : must be 2^N, default=4
+        FFT2D_THREADS_BEGIN_N : default=65536
+    USE_FFT2D_WINTHREADS : default=not defined
+        FFT2D_MAX_THREADS     : must be 2^N, default=4
+        FFT2D_THREADS_BEGIN_N : default=131072
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
+                            exp(2*pi*i*j1*k1/n1) * 
+                            exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+        <case2>
+            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * 
+                            exp(-2*pi*i*j1*k1/n1) * 
+                            exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft2d(n1, 2*n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft2d(n1, 2*n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 1, n1 = power of 2
+        2*n2   :data length (int)
+                n2 >= 1, n2 = power of 2
+        a[0...n1-1][0...2*n2-1]
+               :input/output data (double **)
+                input data
+                    a[j1][2*j2] = Re(x[j1][j2]), 
+                    a[j1][2*j2+1] = Im(x[j1][j2]), 
+                    0<=j1<n1, 0<=j2<n2
+                output data
+                    a[k1][2*k2] = Re(X[k1][k2]), 
+                    a[k1][2*k2+1] = Im(X[k1][k2]), 
+                    0<=k1<n1, 0<=k2<n2
+        t[0...*]
+               :work area (double *)
+                length of t >= 8*n1,                   if single thread, 
+                length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            cdft2d(n1, 2*n2, -1, a, t, ip, w);
+        is 
+            cdft2d(n1, 2*n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
+                    a[j1][j2] *= 1.0 / n1 / n2;
+                }
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+                            0<=k1<n1, 0<=k2<n2
+            I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+                            0<=k1<n1, 0<=k2<n2
+        <case2> IRDFT (excluding scale)
+            a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+                            (R[j1][j2] * 
+                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
+                            I[j1][j2] * 
+                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
+                            0<=k1<n1, 0<=k2<n2
+        (notes: R[n1-k1][n2-k2] = R[k1][k2], 
+                I[n1-k1][n2-k2] = -I[k1][k2], 
+                R[n1-k1][0] = R[k1][0], 
+                I[n1-k1][0] = -I[k1][0], 
+                R[0][n2-k2] = R[0][k2], 
+                I[0][n2-k2] = -I[0][k2], 
+                0<k1<n1, 0<k2<n2)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (double **)
+                <case1>
+                    output data
+                        a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2], 
+                        a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2], 
+                            0<k1<n1, 0<k2<n2/2, 
+                        a[0][2*k2] = R[0][k2] = R[0][n2-k2], 
+                        a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2], 
+                            0<k2<n2/2, 
+                        a[k1][0] = R[k1][0] = R[n1-k1][0], 
+                        a[k1][1] = I[k1][0] = -I[n1-k1][0], 
+                        a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2], 
+                        a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2], 
+                            0<k1<n1/2, 
+                        a[0][0] = R[0][0], 
+                        a[0][1] = R[0][n2/2], 
+                        a[n1/2][0] = R[n1/2][0], 
+                        a[n1/2][1] = R[n1/2][n2/2]
+                <case2>
+                    input data
+                        a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2], 
+                        a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2], 
+                            0<j1<n1, 0<j2<n2/2, 
+                        a[0][2*j2] = R[0][j2] = R[0][n2-j2], 
+                        a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2], 
+                            0<j2<n2/2, 
+                        a[j1][0] = R[j1][0] = R[n1-j1][0], 
+                        a[j1][1] = I[j1][0] = -I[n1-j1][0], 
+                        a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2], 
+                        a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2], 
+                            0<j1<n1/2, 
+                        a[0][0] = R[0][0], 
+                        a[0][1] = R[0][n2/2], 
+                        a[n1/2][0] = R[n1/2][0], 
+                        a[n1/2][1] = R[n1/2][n2/2]
+                ---- output ordering ----
+                    rdft2d(n1, n2, 1, a, t, ip, w);
+                    rdft2dsort(n1, n2, 1, a);
+                    // stored data is a[0...n1-1][0...n2+1]:
+                    // a[k1][2*k2] = R[k1][k2], 
+                    // a[k1][2*k2+1] = I[k1][k2], 
+                    // 0<=k1<n1, 0<=k2<=n2/2.
+                    // the stored data is larger than the input data!
+                ---- input ordering ----
+                    rdft2dsort(n1, n2, -1, a);
+                    rdft2d(n1, n2, -1, a, t, ip, w);
+        t[0...*]
+               :work area (double *)
+                length of t >= 8*n1,                   if single thread, 
+                length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/4) + n2/4
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            rdft2d(n1, n2, 1, a, t, ip, w);
+        is 
+            rdft2d(n1, n2, -1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 2.0 / n1 / n2;
+                }
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            cos(pi*j1*(k1+1/2)/n1) * 
+                            cos(pi*j2*(k2+1/2)/n2), 
+                            0<=k1<n1, 0<=k2<n2
+        <case2> DCT
+            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            cos(pi*(j1+1/2)*k1/n1) * 
+                            cos(pi*(j2+1/2)*k2/n2), 
+                            0<=k1<n1, 0<=k2<n2
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (double **)
+                output data
+                    a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
+        t[0...*]
+               :work area (double *)
+                length of t >= 4*n1,                   if single thread, 
+                length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1/2, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1*3/2, n2*3/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddct2d(n1, n2, -1, a, t, ip, w);
+        is 
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                a[j1][0] *= 0.5;
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                a[0][j2] *= 0.5;
+            }
+            ddct2d(n1, n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 4.0 / n1 / n2;
+                }
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] * 
+                            sin(pi*j1*(k1+1/2)/n1) * 
+                            sin(pi*j2*(k2+1/2)/n2), 
+                            0<=k1<n1, 0<=k2<n2
+        <case2> DST
+            S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * 
+                            sin(pi*(j1+1/2)*k1/n1) * 
+                            sin(pi*(j2+1/2)*k2/n2), 
+                            0<k1<=n1, 0<k2<=n2
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (double **)
+                <case1>
+                    input data
+                        a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2, 
+                        a[j1][0] = A[j1][n2], 0<j1<n1, 
+                        a[0][j2] = A[n1][j2], 0<j2<n2, 
+                        a[0][0] = A[n1][n2]
+                        (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
+                    output data
+                        a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
+                <case2>
+                    output data
+                        a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2, 
+                        a[k1][0] = S[k1][n2], 0<k1<n1, 
+                        a[0][k2] = S[n1][k2], 0<k2<n2, 
+                        a[0][0] = S[n1][n2]
+                        (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
+        t[0...*]
+               :work area (double *)
+                length of t >= 4*n1,                   if single thread, 
+                length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1/2, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1*3/2, n2*3/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddst2d(n1, n2, -1, a, t, ip, w);
+        is 
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                a[j1][0] *= 0.5;
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                a[0][j2] *= 0.5;
+            }
+            ddst2d(n1, n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 4.0 / n1 / n2;
+                }
+            }
+        .
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#define fft2d_alloc_error_check(p) { \
+    if ((p) == NULL) { \
+        fprintf(stderr, "fft2d memory allocation error\n"); \
+        exit(1); \
+    } \
+}
+
+
+#ifdef USE_FFT2D_PTHREADS
+#define USE_FFT2D_THREADS
+#ifndef FFT2D_MAX_THREADS
+#define FFT2D_MAX_THREADS 4
+#endif
+#ifndef FFT2D_THREADS_BEGIN_N
+#define FFT2D_THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#define fft2d_thread_t pthread_t
+#define fft2d_thread_create(thp,func,argp) { \
+    if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
+        fprintf(stderr, "fft2d thread error\n"); \
+        exit(1); \
+    } \
+}
+#define fft2d_thread_wait(th) { \
+    if (pthread_join(th, NULL) != 0) { \
+        fprintf(stderr, "fft2d thread error\n"); \
+        exit(1); \
+    } \
+}
+#endif /* USE_FFT2D_PTHREADS */
+
+
+#ifdef USE_FFT2D_WINTHREADS
+#define USE_FFT2D_THREADS
+#ifndef FFT2D_MAX_THREADS
+#define FFT2D_MAX_THREADS 4
+#endif
+#ifndef FFT2D_THREADS_BEGIN_N
+#define FFT2D_THREADS_BEGIN_N 131072
+#endif
+#include <windows.h>
+#define fft2d_thread_t HANDLE
+#define fft2d_thread_create(thp,func,argp) { \
+    DWORD thid; \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
+    if (*(thp) == 0) { \
+        fprintf(stderr, "fft2d thread error\n"); \
+        exit(1); \
+    } \
+}
+#define fft2d_thread_wait(th) { \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th); \
+}
+#endif /* USE_FFT2D_WINTHREADS */
+
+
+void cdft2d(int n1, int n2, int isgn, double **a, double *t, 
+    int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t, 
+        int *ip, double *w);
+#ifdef USE_FFT2D_THREADS
+    void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a, 
+        int *ip, double *w);
+    void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t, 
+        int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+    int n, itnull, nthread, nt, i;
+    
+    n = n1 << 1;
+    if (n < n2) {
+        n = n2;
+    }
+    if (n > (ip[0] << 2)) {
+        makewt(n >> 2, ip, w);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nthread = 1;
+#ifdef USE_FFT2D_THREADS
+        nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+        nt = 8 * nthread * n1;
+        if (n2 == 4 * nthread) {
+            nt >>= 1;
+        } else if (n2 < 4 * nthread) {
+            nt >>= 2;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft2d_alloc_error_check(t);
+    }
+#ifdef USE_FFT2D_THREADS
+    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+        xdft2d0_subth(n1, n2, 0, isgn, a, ip, w);
+        cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+    } else 
+#endif /* USE_FFT2D_THREADS */
+    {
+        for (i = 0; i < n1; i++) {
+            cdft(n2, isgn, a[i], ip, w);
+        }
+        cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+void rdft2d(int n1, int n2, int isgn, double **a, double *t, 
+    int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t, 
+        int *ip, double *w);
+    void rdft2d_sub(int n1, int n2, int isgn, double **a);
+#ifdef USE_FFT2D_THREADS
+    void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a, 
+        int *ip, double *w);
+    void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t, 
+        int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+    int n, nw, nc, itnull, nthread, nt, i;
+    
+    n = n1 << 1;
+    if (n < n2) {
+        n = n2;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n2 > (nc << 2)) {
+        nc = n2 >> 2;
+        makect(nc, ip, w + nw);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nthread = 1;
+#ifdef USE_FFT2D_THREADS
+        nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+        nt = 8 * nthread * n1;
+        if (n2 == 4 * nthread) {
+            nt >>= 1;
+        } else if (n2 < 4 * nthread) {
+            nt >>= 2;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft2d_alloc_error_check(t);
+    }
+#ifdef USE_FFT2D_THREADS
+    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+        if (isgn < 0) {
+            rdft2d_sub(n1, n2, isgn, a);
+            cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+        }
+        xdft2d0_subth(n1, n2, 1, isgn, a, ip, w);
+        if (isgn >= 0) {
+            cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+            rdft2d_sub(n1, n2, isgn, a);
+        }
+    } else 
+#endif /* USE_FFT2D_THREADS */
+    {
+        if (isgn < 0) {
+            rdft2d_sub(n1, n2, isgn, a);
+            cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+        }
+        for (i = 0; i < n1; i++) {
+            rdft(n2, isgn, a[i], ip, w);
+        }
+        if (isgn >= 0) {
+            cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+            rdft2d_sub(n1, n2, isgn, a);
+        }
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+void rdft2dsort(int n1, int n2, int isgn, double **a)
+{
+    int n1h, i;
+    double x, y;
+    
+    n1h = n1 >> 1;
+    if (isgn < 0) {
+        for (i = n1h + 1; i < n1; i++) {
+            a[i][0] = a[i][n2 + 1];
+            a[i][1] = a[i][n2];
+        }
+        a[0][1] = a[0][n2];
+        a[n1h][1] = a[n1h][n2];
+    } else {
+        for (i = n1h + 1; i < n1; i++) {
+            y = a[i][0];
+            x = a[i][1];
+            a[i][n2] = x;
+            a[i][n2 + 1] = y;
+            a[n1 - i][n2] = x;
+            a[n1 - i][n2 + 1] = -y;
+            a[i][0] = a[n1 - i][0];
+            a[i][1] = -a[n1 - i][1];
+        }
+        a[0][n2] = a[0][1];
+        a[0][n2 + 1] = 0;
+        a[0][1] = 0;
+        a[n1h][n2] = a[n1h][1];
+        a[n1h][n2 + 1] = 0;
+        a[n1h][1] = 0;
+    }
+}
+
+
+void ddct2d(int n1, int n2, int isgn, double **a, double *t, 
+    int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a, 
+        double *t, int *ip, double *w);
+#ifdef USE_FFT2D_THREADS
+    void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a, 
+        int *ip, double *w);
+    void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a, 
+        double *t, int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+    int n, nw, nc, itnull, nthread, nt, i;
+    
+    n = n1;
+    if (n < n2) {
+        n = n2;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nthread = 1;
+#ifdef USE_FFT2D_THREADS
+        nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+        nt = 4 * nthread * n1;
+        if (n2 == 2 * nthread) {
+            nt >>= 1;
+        } else if (n2 < 2 * nthread) {
+            nt >>= 2;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft2d_alloc_error_check(t);
+    }
+#ifdef USE_FFT2D_THREADS
+    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+        ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w);
+        ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w);
+    } else 
+#endif /* USE_FFT2D_THREADS */
+    {
+        for (i = 0; i < n1; i++) {
+            ddct(n2, isgn, a[i], ip, w);
+        }
+        ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w);
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+void ddst2d(int n1, int n2, int isgn, double **a, double *t, 
+    int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a, 
+        double *t, int *ip, double *w);
+#ifdef USE_FFT2D_THREADS
+    void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a, 
+        int *ip, double *w);
+    void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a, 
+        double *t, int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+    int n, nw, nc, itnull, nthread, nt, i;
+    
+    n = n1;
+    if (n < n2) {
+        n = n2;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nthread = 1;
+#ifdef USE_FFT2D_THREADS
+        nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+        nt = 4 * nthread * n1;
+        if (n2 == 2 * nthread) {
+            nt >>= 1;
+        } else if (n2 < 2 * nthread) {
+            nt >>= 2;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft2d_alloc_error_check(t);
+    }
+#ifdef USE_FFT2D_THREADS
+    if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+        ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w);
+        ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w);
+    } else 
+#endif /* USE_FFT2D_THREADS */
+    {
+        for (i = 0; i < n1; i++) {
+            ddst(n2, isgn, a[i], ip, w);
+        }
+        ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w);
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t, 
+    int *ip, double *w)
+{
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    int i, j;
+    
+    if (n2 > 4) {
+        for (j = 0; j < n2; j += 8) {
+            for (i = 0; i < n1; i++) {
+                t[2 * i] = a[i][j];
+                t[2 * i + 1] = a[i][j + 1];
+                t[2 * n1 + 2 * i] = a[i][j + 2];
+                t[2 * n1 + 2 * i + 1] = a[i][j + 3];
+                t[4 * n1 + 2 * i] = a[i][j + 4];
+                t[4 * n1 + 2 * i + 1] = a[i][j + 5];
+                t[6 * n1 + 2 * i] = a[i][j + 6];
+                t[6 * n1 + 2 * i + 1] = a[i][j + 7];
+            }
+            cdft(2 * n1, isgn, t, ip, w);
+            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+            cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+            cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+            for (i = 0; i < n1; i++) {
+                a[i][j] = t[2 * i];
+                a[i][j + 1] = t[2 * i + 1];
+                a[i][j + 2] = t[2 * n1 + 2 * i];
+                a[i][j + 3] = t[2 * n1 + 2 * i + 1];
+                a[i][j + 4] = t[4 * n1 + 2 * i];
+                a[i][j + 5] = t[4 * n1 + 2 * i + 1];
+                a[i][j + 6] = t[6 * n1 + 2 * i];
+                a[i][j + 7] = t[6 * n1 + 2 * i + 1];
+            }
+        }
+    } else if (n2 == 4) {
+        for (i = 0; i < n1; i++) {
+            t[2 * i] = a[i][0];
+            t[2 * i + 1] = a[i][1];
+            t[2 * n1 + 2 * i] = a[i][2];
+            t[2 * n1 + 2 * i + 1] = a[i][3];
+        }
+        cdft(2 * n1, isgn, t, ip, w);
+        cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+        for (i = 0; i < n1; i++) {
+            a[i][0] = t[2 * i];
+            a[i][1] = t[2 * i + 1];
+            a[i][2] = t[2 * n1 + 2 * i];
+            a[i][3] = t[2 * n1 + 2 * i + 1];
+        }
+    } else if (n2 == 2) {
+        for (i = 0; i < n1; i++) {
+            t[2 * i] = a[i][0];
+            t[2 * i + 1] = a[i][1];
+        }
+        cdft(2 * n1, isgn, t, ip, w);
+        for (i = 0; i < n1; i++) {
+            a[i][0] = t[2 * i];
+            a[i][1] = t[2 * i + 1];
+        }
+    }
+}
+
+
+void rdft2d_sub(int n1, int n2, int isgn, double **a)
+{
+    int n1h, i, j;
+    double xi;
+    
+    n1h = n1 >> 1;
+    if (isgn < 0) {
+        for (i = 1; i < n1h; i++) {
+            j = n1 - i;
+            xi = a[i][0] - a[j][0];
+            a[i][0] += a[j][0];
+            a[j][0] = xi;
+            xi = a[j][1] - a[i][1];
+            a[i][1] += a[j][1];
+            a[j][1] = xi;
+        }
+    } else {
+        for (i = 1; i < n1h; i++) {
+            j = n1 - i;
+            a[j][0] = 0.5 * (a[i][0] - a[j][0]);
+            a[i][0] -= a[j][0];
+            a[j][1] = 0.5 * (a[i][1] + a[j][1]);
+            a[i][1] -= a[j][1];
+        }
+    }
+}
+
+
+void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a, 
+    double *t, int *ip, double *w)
+{
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    int i, j;
+    
+    if (n2 > 2) {
+        for (j = 0; j < n2; j += 4) {
+            for (i = 0; i < n1; i++) {
+                t[i] = a[i][j];
+                t[n1 + i] = a[i][j + 1];
+                t[2 * n1 + i] = a[i][j + 2];
+                t[3 * n1 + i] = a[i][j + 3];
+            }
+            if (ics == 0) {
+                ddct(n1, isgn, t, ip, w);
+                ddct(n1, isgn, &t[n1], ip, w);
+                ddct(n1, isgn, &t[2 * n1], ip, w);
+                ddct(n1, isgn, &t[3 * n1], ip, w);
+            } else {
+                ddst(n1, isgn, t, ip, w);
+                ddst(n1, isgn, &t[n1], ip, w);
+                ddst(n1, isgn, &t[2 * n1], ip, w);
+                ddst(n1, isgn, &t[3 * n1], ip, w);
+            }
+            for (i = 0; i < n1; i++) {
+                a[i][j] = t[i];
+                a[i][j + 1] = t[n1 + i];
+                a[i][j + 2] = t[2 * n1 + i];
+                a[i][j + 3] = t[3 * n1 + i];
+            }
+        }
+    } else if (n2 == 2) {
+        for (i = 0; i < n1; i++) {
+            t[i] = a[i][0];
+            t[n1 + i] = a[i][1];
+        }
+        if (ics == 0) {
+            ddct(n1, isgn, t, ip, w);
+            ddct(n1, isgn, &t[n1], ip, w);
+        } else {
+            ddst(n1, isgn, t, ip, w);
+            ddst(n1, isgn, &t[n1], ip, w);
+        }
+        for (i = 0; i < n1; i++) {
+            a[i][0] = t[i];
+            a[i][1] = t[n1 + i];
+        }
+    }
+}
+
+
+#ifdef USE_FFT2D_THREADS
+struct fft2d_arg_st {
+    int nthread;
+    int n0;
+    int n1;
+    int n2;
+    int ic;
+    int isgn;
+    double **a;
+    double *t;
+    int *ip;
+    double *w;
+};
+typedef struct fft2d_arg_st fft2d_arg_t;
+
+
+void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a, 
+    int *ip, double *w)
+{
+    void *xdft2d0_th(void *p);
+    fft2d_thread_t th[FFT2D_MAX_THREADS];
+    fft2d_arg_t ag[FFT2D_MAX_THREADS];
+    int nthread, i;
+    
+    nthread = FFT2D_MAX_THREADS;
+    if (nthread > n1) {
+        nthread = n1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].ic = icr;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft2d_thread_create(&th[i], xdft2d0_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft2d_thread_wait(th[i]);
+    }
+}
+
+
+void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t, 
+    int *ip, double *w)
+{
+    void *cdft2d_th(void *p);
+    fft2d_thread_t th[FFT2D_MAX_THREADS];
+    fft2d_arg_t ag[FFT2D_MAX_THREADS];
+    int nthread, nt, i;
+    
+    nthread = FFT2D_MAX_THREADS;
+    nt = 8 * n1;
+    if (n2 == 4 * FFT2D_MAX_THREADS) {
+        nt >>= 1;
+    } else if (n2 < 4 * FFT2D_MAX_THREADS) {
+        nthread = n2 >> 1;
+        nt >>= 2;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].t = &t[nt * i];
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft2d_thread_create(&th[i], cdft2d_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft2d_thread_wait(th[i]);
+    }
+}
+
+
+void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a, 
+    int *ip, double *w)
+{
+    void *ddxt2d0_th(void *p);
+    fft2d_thread_t th[FFT2D_MAX_THREADS];
+    fft2d_arg_t ag[FFT2D_MAX_THREADS];
+    int nthread, i;
+    
+    nthread = FFT2D_MAX_THREADS;
+    if (nthread > n1) {
+        nthread = n1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].ic = ics;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft2d_thread_create(&th[i], ddxt2d0_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft2d_thread_wait(th[i]);
+    }
+}
+
+
+void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a, 
+    double *t, int *ip, double *w)
+{
+    void *ddxt2d_th(void *p);
+    fft2d_thread_t th[FFT2D_MAX_THREADS];
+    fft2d_arg_t ag[FFT2D_MAX_THREADS];
+    int nthread, nt, i;
+    
+    nthread = FFT2D_MAX_THREADS;
+    nt = 4 * n1;
+    if (n2 == 2 * FFT2D_MAX_THREADS) {
+        nt >>= 1;
+    } else if (n2 < 2 * FFT2D_MAX_THREADS) {
+        nthread = n2;
+        nt >>= 2;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].ic = ics;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].t = &t[nt * i];
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft2d_thread_create(&th[i], ddxt2d_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft2d_thread_wait(th[i]);
+    }
+}
+
+
+void *xdft2d0_th(void *p)
+{
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, icr, isgn, *ip, i;
+    double **a, *w;
+    
+    nthread = ((fft2d_arg_t *) p)->nthread;
+    n0 = ((fft2d_arg_t *) p)->n0;
+    n1 = ((fft2d_arg_t *) p)->n1;
+    n2 = ((fft2d_arg_t *) p)->n2;
+    icr = ((fft2d_arg_t *) p)->ic;
+    isgn = ((fft2d_arg_t *) p)->isgn;
+    a = ((fft2d_arg_t *) p)->a;
+    ip = ((fft2d_arg_t *) p)->ip;
+    w = ((fft2d_arg_t *) p)->w;
+    if (icr == 0) {
+        for (i = n0; i < n1; i += nthread) {
+            cdft(n2, isgn, a[i], ip, w);
+        }
+    } else {
+        for (i = n0; i < n1; i += nthread) {
+            rdft(n2, isgn, a[i], ip, w);
+        }
+    }
+    return (void *) 0;
+}
+
+
+void *cdft2d_th(void *p)
+{
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, isgn, *ip, i, j;
+    double **a, *t, *w;
+    
+    nthread = ((fft2d_arg_t *) p)->nthread;
+    n0 = ((fft2d_arg_t *) p)->n0;
+    n1 = ((fft2d_arg_t *) p)->n1;
+    n2 = ((fft2d_arg_t *) p)->n2;
+    isgn = ((fft2d_arg_t *) p)->isgn;
+    a = ((fft2d_arg_t *) p)->a;
+    t = ((fft2d_arg_t *) p)->t;
+    ip = ((fft2d_arg_t *) p)->ip;
+    w = ((fft2d_arg_t *) p)->w;
+    if (n2 > 4 * nthread) {
+        for (j = 8 * n0; j < n2; j += 8 * nthread) {
+            for (i = 0; i < n1; i++) {
+                t[2 * i] = a[i][j];
+                t[2 * i + 1] = a[i][j + 1];
+                t[2 * n1 + 2 * i] = a[i][j + 2];
+                t[2 * n1 + 2 * i + 1] = a[i][j + 3];
+                t[4 * n1 + 2 * i] = a[i][j + 4];
+                t[4 * n1 + 2 * i + 1] = a[i][j + 5];
+                t[6 * n1 + 2 * i] = a[i][j + 6];
+                t[6 * n1 + 2 * i + 1] = a[i][j + 7];
+            }
+            cdft(2 * n1, isgn, t, ip, w);
+            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+            cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+            cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+            for (i = 0; i < n1; i++) {
+                a[i][j] = t[2 * i];
+                a[i][j + 1] = t[2 * i + 1];
+                a[i][j + 2] = t[2 * n1 + 2 * i];
+                a[i][j + 3] = t[2 * n1 + 2 * i + 1];
+                a[i][j + 4] = t[4 * n1 + 2 * i];
+                a[i][j + 5] = t[4 * n1 + 2 * i + 1];
+                a[i][j + 6] = t[6 * n1 + 2 * i];
+                a[i][j + 7] = t[6 * n1 + 2 * i + 1];
+            }
+        }
+    } else if (n2 == 4 * nthread) {
+        for (i = 0; i < n1; i++) {
+            t[2 * i] = a[i][4 * n0];
+            t[2 * i + 1] = a[i][4 * n0 + 1];
+            t[2 * n1 + 2 * i] = a[i][4 * n0 + 2];
+            t[2 * n1 + 2 * i + 1] = a[i][4 * n0 + 3];
+        }
+        cdft(2 * n1, isgn, t, ip, w);
+        cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+        for (i = 0; i < n1; i++) {
+            a[i][4 * n0] = t[2 * i];
+            a[i][4 * n0 + 1] = t[2 * i + 1];
+            a[i][4 * n0 + 2] = t[2 * n1 + 2 * i];
+            a[i][4 * n0 + 3] = t[2 * n1 + 2 * i + 1];
+        }
+    } else if (n2 == 2 * nthread) {
+        for (i = 0; i < n1; i++) {
+            t[2 * i] = a[i][2 * n0];
+            t[2 * i + 1] = a[i][2 * n0 + 1];
+        }
+        cdft(2 * n1, isgn, t, ip, w);
+        for (i = 0; i < n1; i++) {
+            a[i][2 * n0] = t[2 * i];
+            a[i][2 * n0 + 1] = t[2 * i + 1];
+        }
+    }
+    return (void *) 0;
+}
+
+
+void *ddxt2d0_th(void *p)
+{
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, ics, isgn, *ip, i;
+    double **a, *w;
+    
+    nthread = ((fft2d_arg_t *) p)->nthread;
+    n0 = ((fft2d_arg_t *) p)->n0;
+    n1 = ((fft2d_arg_t *) p)->n1;
+    n2 = ((fft2d_arg_t *) p)->n2;
+    ics = ((fft2d_arg_t *) p)->ic;
+    isgn = ((fft2d_arg_t *) p)->isgn;
+    a = ((fft2d_arg_t *) p)->a;
+    ip = ((fft2d_arg_t *) p)->ip;
+    w = ((fft2d_arg_t *) p)->w;
+    if (ics == 0) {
+        for (i = n0; i < n1; i += nthread) {
+            ddct(n2, isgn, a[i], ip, w);
+        }
+    } else {
+        for (i = n0; i < n1; i += nthread) {
+            ddst(n2, isgn, a[i], ip, w);
+        }
+    }
+    return (void *) 0;
+}
+
+
+void *ddxt2d_th(void *p)
+{
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, ics, isgn, *ip, i, j;
+    double **a, *t, *w;
+    
+    nthread = ((fft2d_arg_t *) p)->nthread;
+    n0 = ((fft2d_arg_t *) p)->n0;
+    n1 = ((fft2d_arg_t *) p)->n1;
+    n2 = ((fft2d_arg_t *) p)->n2;
+    ics = ((fft2d_arg_t *) p)->ic;
+    isgn = ((fft2d_arg_t *) p)->isgn;
+    a = ((fft2d_arg_t *) p)->a;
+    t = ((fft2d_arg_t *) p)->t;
+    ip = ((fft2d_arg_t *) p)->ip;
+    w = ((fft2d_arg_t *) p)->w;
+    if (n2 > 2 * nthread) {
+        for (j = 4 * n0; j < n2; j += 4 * nthread) {
+            for (i = 0; i < n1; i++) {
+                t[i] = a[i][j];
+                t[n1 + i] = a[i][j + 1];
+                t[2 * n1 + i] = a[i][j + 2];
+                t[3 * n1 + i] = a[i][j + 3];
+            }
+            if (ics == 0) {
+                ddct(n1, isgn, t, ip, w);
+                ddct(n1, isgn, &t[n1], ip, w);
+                ddct(n1, isgn, &t[2 * n1], ip, w);
+                ddct(n1, isgn, &t[3 * n1], ip, w);
+            } else {
+                ddst(n1, isgn, t, ip, w);
+                ddst(n1, isgn, &t[n1], ip, w);
+                ddst(n1, isgn, &t[2 * n1], ip, w);
+                ddst(n1, isgn, &t[3 * n1], ip, w);
+            }
+            for (i = 0; i < n1; i++) {
+                a[i][j] = t[i];
+                a[i][j + 1] = t[n1 + i];
+                a[i][j + 2] = t[2 * n1 + i];
+                a[i][j + 3] = t[3 * n1 + i];
+            }
+        }
+    } else if (n2 == 2 * nthread) {
+        for (i = 0; i < n1; i++) {
+            t[i] = a[i][2 * n0];
+            t[n1 + i] = a[i][2 * n0 + 1];
+        }
+        if (ics == 0) {
+            ddct(n1, isgn, t, ip, w);
+            ddct(n1, isgn, &t[n1], ip, w);
+        } else {
+            ddst(n1, isgn, t, ip, w);
+            ddst(n1, isgn, &t[n1], ip, w);
+        }
+        for (i = 0; i < n1; i++) {
+            a[i][2 * n0] = t[i];
+            a[i][2 * n0 + 1] = t[n1 + i];
+        }
+    } else if (n2 == nthread) {
+        for (i = 0; i < n1; i++) {
+            t[i] = a[i][n0];
+        }
+        if (ics == 0) {
+            ddct(n1, isgn, t, ip, w);
+        } else {
+            ddst(n1, isgn, t, ip, w);
+        }
+        for (i = 0; i < n1; i++) {
+            a[i][n0] = t[i];
+        }
+    }
+    return (void *) 0;
+}
+#endif /* USE_FFT2D_THREADS */
+
diff --git a/fftsg2d.f b/fftsg2d.f
new file mode 100644
index 0000000..49a9d6c
--- /dev/null
+++ b/fftsg2d.f
@@ -0,0 +1,562 @@
+! Fast Fourier/Cosine/Sine Transform
+!     dimension   :two
+!     data length :power of 2
+!     decimation  :frequency
+!     radix       :split-radix, row-column
+!     data        :inplace
+!     table       :use
+! subroutines
+!     cdft2d: Complex Discrete Fourier Transform
+!     rdft2d: Real Discrete Fourier Transform
+!     ddct2d: Discrete Cosine Transform
+!     ddst2d: Discrete Sine Transform
+! necessary package
+!     fftsg.f  : 1D-FFT package
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+!     [definition]
+!         <case1>
+!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
+!                            exp(2*pi*i*j1*k1/n1) * 
+!                            exp(2*pi*i*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2>
+!             X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) * 
+!                            exp(-2*pi*i*j1*k1/n1) * 
+!                            exp(-2*pi*i*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call cdft2d(n1max, 2*n1, n2, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call cdft2d(n1max, 2*n1, n2, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         2*n1   :data length (integer)
+!                 n1 >= 1, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 1, n2 = power of 2
+!         a(0:2*n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 input data
+!                     a(2*j1,j2) = Re(x(j1,j2)), 
+!                     a(2*j1+1,j2) = Im(x(j1,j2)), 
+!                     0<=j1<n1, 0<=j2<n2
+!                 output data
+!                     a(2*k1,k2) = Re(X(k1,k2)), 
+!                     a(2*k1+1,k2) = Im(X(k1,k2)), 
+!                     0<=k1<n1, 0<=k2<n2
+!         t(0:8*n2-1)
+!                :work area (real*8)
+!                 length of t >= 8*n2
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1, n2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/2, n2/2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call cdft2d(n1max, 2*n1, n2, -1, a, t, ip, w)
+!         is 
+!             call cdft2d(n1max, 2*n1, n2, 1, a, t, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, 2 * n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (1.0d0 / n1 / n2)
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+!     [definition]
+!         <case1> RDFT
+!             R(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!             I(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2> IRDFT (excluding scale)
+!             a(k1,k2) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+!                            (R(j1,j2) * 
+!                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + 
+!                            I(j1,j2) * 
+!                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 
+!                            0<=k1<n1, 0<=k2<n2
+!         (notes: R(n1-k1,n2-k2) = R(k1,k2), 
+!                 I(n1-k1,n2-k2) = -I(k1,k2), 
+!                 R(n1-k1,0) = R(k1,0), 
+!                 I(n1-k1,0) = -I(k1,0), 
+!                 R(0,n2-k2) = R(0,k2), 
+!                 I(0,n2-k2) = -I(0,k2), 
+!                 0<k1<n1, 0<k2<n2)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         a(0:n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 <case1>
+!                     output data
+!                         a(2*k1,k2) = R(k1,k2) = R(n1-k1,n2-k2), 
+!                         a(2*k1+1,k2) = I(k1,k2) = -I(n1-k1,n2-k2), 
+!                             0<k1<n1/2, 0<k2<n2, 
+!                         a(2*k1,0) = R(k1,0) = R(n1-k1,0), 
+!                         a(2*k1+1,0) = I(k1,0) = -I(n1-k1,0), 
+!                             0<k1<n1/2, 
+!                         a(0,k2) = R(0,k2) = R(0,n2-k2), 
+!                         a(1,k2) = I(0,k2) = -I(0,n2-k2), 
+!                         a(1,n2-k2) = R(n1/2,k2) = R(n1/2,n2-k2), 
+!                         a(0,n2-k2) = -I(n1/2,k2) = I(n1/2,n2-k2), 
+!                             0<k2<n2/2, 
+!                         a(0,0) = R(0,0), 
+!                         a(1,0) = R(n1/2,0), 
+!                         a(0,n2/2) = R(0,n2/2), 
+!                         a(1,n2/2) = R(n1/2,n2/2)
+!                 <case2>
+!                     input data
+!                         a(2*j1,j2) = R(j1,j2) = R(n1-j1,n2-j2), 
+!                         a(2*j1+1,j2) = I(j1,j2) = -I(n1-j1,n2-j2), 
+!                             0<j1<n1/2, 0<j2<n2, 
+!                         a(2*j1,0) = R(j1,0) = R(n1-j1,0), 
+!                         a(2*j1+1,0) = I(j1,0) = -I(n1-j1,0), 
+!                             0<j1<n1/2, 
+!                         a(0,j2) = R(0,j2) = R(0,n2-j2), 
+!                         a(1,j2) = I(0,j2) = -I(0,n2-j2), 
+!                         a(1,n2-j2) = R(n1/2,j2) = R(n1/2,n2-j2), 
+!                         a(0,n2-j2) = -I(n1/2,j2) = I(n1/2,n2-j2), 
+!                             0<j2<n2/2, 
+!                         a(0,0) = R(0,0), 
+!                         a(1,0) = R(n1/2,0), 
+!                         a(0,n2/2) = R(0,n2/2), 
+!                         a(1,n2/2) = R(n1/2,n2/2)
+!                 ---- output ordering ----
+!                     call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
+!                     call rdft2dsort(n1max, n1, n2, 1, a)
+!                     ! stored data is a(0:n1-1,0:n2+1):
+!                     ! a(2*k1,k2) = R(k1,k2), 
+!                     ! a(2*k1+1,k2) = I(k1,k2), 
+!                     ! 0<=k1<=n1/2, 0<=k2<n2.
+!                     ! the stored data is larger than the input data!
+!                 ---- input ordering ----
+!                     call rdft2dsort(n1max, n1, n2, -1, a)
+!                     call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
+!         t(0:8*n2-1)
+!                :work area (real*8)
+!                 length of t >= 8*n2
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/4, n2/2) + n1/4
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
+!         is 
+!             call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (2.0d0 / n1 / n2)
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+!     [definition]
+!         <case1> IDCT (excluding scale)
+!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            cos(pi*j1*(k1+1/2)/n1) * 
+!                            cos(pi*j2*(k2+1/2)/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2> DCT
+!             C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            cos(pi*(j1+1/2)*k1/n1) * 
+!                            cos(pi*(j2+1/2)*k2/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         a(0:n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 output data
+!                     a(k1,k2) = C(k1,k2), 0<=k1<n1, 0<=k2<n2
+!         t(0:4*n2-1)
+!                :work area (real*8)
+!                 length of t >= 4*n2
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2/2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1*3/2, n2*3/2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+!         is 
+!             do j1 = 0, n1 - 1
+!                 a(j1, 0) = a(j1, 0) * 0.5d0
+!             end do
+!             do j2 = 0, n2 - 1
+!                 a(0, j2) = a(0, j2) * 0.5d0
+!             end do
+!             call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (4.0d0 / n1 / n2)
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+!     [definition]
+!         <case1> IDST (excluding scale)
+!             S(k1,k2) = sum_j1=1^n1 sum_j2=1^n2 A(j1,j2) * 
+!                            sin(pi*j1*(k1+1/2)/n1) * 
+!                            sin(pi*j2*(k2+1/2)/n2), 
+!                            0<=k1<n1, 0<=k2<n2
+!         <case2> DST
+!             S(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) * 
+!                            sin(pi*(j1+1/2)*k1/n1) * 
+!                            sin(pi*(j2+1/2)*k2/n2), 
+!                            0<k1<=n1, 0<k2<=n2
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row size of the 2D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         a(0:n1-1,0:n2-1)
+!                :input/output data (real*8)
+!                 <case1>
+!                     input data
+!                         a(j1,j2) = A(j1,j2), 0<j1<n1, 0<j2<n2, 
+!                         a(j1,0) = A(j1,n2), 0<j1<n1, 
+!                         a(0,j2) = A(n1,j2), 0<j2<n2, 
+!                         a(0,0) = A(n1,n2)
+!                         (i.e. A(j1,j2) = a(mod(j1,n1),mod(j2,n2)))
+!                     output data
+!                         a(k1,k2) = S(k1,k2), 0<=k1<n1, 0<=k2<n2
+!                 <case2>
+!                     output data
+!                         a(k1,k2) = S(k1,k2), 0<k1<n1, 0<k2<n2, 
+!                         a(k1,0) = S(k1,n2), 0<k1<n1, 
+!                         a(0,k2) = S(n1,k2), 0<k2<n2, 
+!                         a(0,0) = S(n1,n2)
+!                         (i.e. S(k1,k2) = a(mod(k1,n1),mod(k2,n2)))
+!         t(0:4*n2-1)
+!                :work area (real*8)
+!                 length of t >= 4*n2
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2/2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1*3/2, n2*3/2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+!         is 
+!             do j1 = 0, n1 - 1
+!                 a(j1, 0) = a(j1, 0) * 0.5d0
+!             end do
+!             do j2 = 0, n2 - 1
+!                 a(0, j2) = a(0, j2) * 0.5d0
+!             end do
+!             call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2) = a(j1, j2) * (4.0d0 / n1 / n2)
+!                 end do
+!             end do
+!         .
+!
+!
+      subroutine cdft2d(n1max, n1, n2, isgn, a, t, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1), 
+     &    w(0 : *)
+      n = max(n1, 2 * n2)
+      if (n .gt. 4 * ip(0)) then
+          call makewt(n / 4, ip, w)
+      end if
+      do j = 0, n2 - 1
+          call cdft(n1, isgn, a(0, j), ip, w)
+      end do
+      call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+      end
+!
+      subroutine rdft2d(n1max, n1, n2, isgn, a, t, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1), 
+     &    w(0 : *)
+      n = max(n1, 2 * n2)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n1 .gt. 4 * nc) then
+          nc = n1 / 4
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          call rdft2d_sub(n1max, n1, n2, isgn, a)
+          call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+      end if
+      do j = 0, n2 - 1
+          call rdft(n1, isgn, a(0, j), ip, w)
+      end do
+      if (isgn .ge. 0) then
+          call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+          call rdft2d_sub(n1max, n1, n2, isgn, a)
+      end if
+      end
+!
+      subroutine rdft2dsort(n1max, n1, n2, isgn, a)
+      integer n1max, n1, n2, isgn, n2h, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), x, y
+      n2h = n2 / 2
+      if (isgn .lt. 0) then
+          do j = n2h + 1, n2 - 1
+              a(0, j) = a(n1 + 1, j)
+              a(1, j) = a(n1, j)
+          end do
+          a(1, 0) = a(n1, 0)
+          a(1, n2h) = a(n1, n2h)
+      else
+          do j = n2h + 1, n2 - 1
+              y = a(0, j)
+              x = a(1, j)
+              a(n1, j) = x
+              a(n1 + 1, j) = y
+              a(n1, n2 - j) = x
+              a(n1 + 1, n2 - j) = -y
+              a(0, j) = a(0, n2 - j)
+              a(1, j) = -a(1, n2 - j)
+          end do
+          a(n1, 0) = a(1, 0)
+          a(n1 + 1, 0) = 0
+          a(1, 0) = 0
+          a(n1, n2h) = a(1, n2h)
+          a(n1 + 1, n2h) = 0
+          a(1, n2h) = 0
+      end if
+      end
+!
+      subroutine ddct2d(n1max, n1, n2, isgn, a, t, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1), 
+     &    w(0 : *)
+      n = max(n1, n2)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      do j = 0, n2 - 1
+          call ddct(n1, isgn, a(0, j), ip, w)
+      end do
+      call ddxt2d_sub(n1max, n1, n2, 0, isgn, a, t, ip, w)
+      end
+!
+      subroutine ddst2d(n1max, n1, n2, isgn, a, t, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1), 
+     &    w(0 : *)
+      n = max(n1, n2)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      do j = 0, n2 - 1
+          call ddst(n1, isgn, a(0, j), ip, w)
+      end do
+      call ddxt2d_sub(n1max, n1, n2, 1, isgn, a, t, ip, w)
+      end
+!
+! -------- child routines --------
+!
+      subroutine cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+      integer n1max, n1, n2, isgn, ip(0 : *), i, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1), 
+     &    w(0 : *)
+      if (n1 .gt. 4) then
+          do i = 0, n1 - 8, 8
+              do j = 0, n2 - 1
+                  t(2 * j) = a(i, j)
+                  t(2 * j + 1) = a(i + 1, j)
+                  t(2 * n2 + 2 * j) = a(i + 2, j)
+                  t(2 * n2 + 2 * j + 1) = a(i + 3, j)
+                  t(4 * n2 + 2 * j) = a(i + 4, j)
+                  t(4 * n2 + 2 * j + 1) = a(i + 5, j)
+                  t(6 * n2 + 2 * j) = a(i + 6, j)
+                  t(6 * n2 + 2 * j + 1) = a(i + 7, j)
+              end do
+              call cdft(2 * n2, isgn, t, ip, w)
+              call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+              call cdft(2 * n2, isgn, t(4 * n2), ip, w)
+              call cdft(2 * n2, isgn, t(6 * n2), ip, w)
+              do j = 0, n2 - 1
+                  a(i, j) = t(2 * j)
+                  a(i + 1, j) = t(2 * j + 1)
+                  a(i + 2, j) = t(2 * n2 + 2 * j)
+                  a(i + 3, j) = t(2 * n2 + 2 * j + 1)
+                  a(i + 4, j) = t(4 * n2 + 2 * j)
+                  a(i + 5, j) = t(4 * n2 + 2 * j + 1)
+                  a(i + 6, j) = t(6 * n2 + 2 * j)
+                  a(i + 7, j) = t(6 * n2 + 2 * j + 1)
+              end do
+          end do
+      else if (n1 .eq. 4) then
+          do j = 0, n2 - 1
+              t(2 * j) = a(0, j)
+              t(2 * j + 1) = a(1, j)
+              t(2 * n2 + 2 * j) = a(2, j)
+              t(2 * n2 + 2 * j + 1) = a(3, j)
+          end do
+          call cdft(2 * n2, isgn, t, ip, w)
+          call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+          do j = 0, n2 - 1
+              a(0, j) = t(2 * j)
+              a(1, j) = t(2 * j + 1)
+              a(2, j) = t(2 * n2 + 2 * j)
+              a(3, j) = t(2 * n2 + 2 * j + 1)
+          end do
+      else if (n1 .eq. 2) then
+          do j = 0, n2 - 1
+              t(2 * j) = a(0, j)
+              t(2 * j + 1) = a(1, j)
+          end do
+          call cdft(2 * n2, isgn, t, ip, w)
+          do j = 0, n2 - 1
+              a(0, j) = t(2 * j)
+              a(1, j) = t(2 * j + 1)
+          end do
+      end if
+      end
+!
+      subroutine rdft2d_sub(n1max, n1, n2, isgn, a)
+      integer n1max, n1, n2, isgn, n2h, i, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), xi
+      n2h = n2 / 2
+      if (isgn .lt. 0) then
+          do i = 1, n2h - 1
+              j = n2 - i
+              xi = a(0, i) - a(0, j)
+              a(0, i) = a(0, i) + a(0, j)
+              a(0, j) = xi
+              xi = a(1, j) - a(1, i)
+              a(1, i) = a(1, i) + a(1, j)
+              a(1, j) = xi
+          end do
+      else
+          do i = 1, n2h - 1
+              j = n2 - i
+              a(0, j) = 0.5d0 * (a(0, i) - a(0, j))
+              a(0, i) = a(0, i) - a(0, j)
+              a(1, j) = 0.5d0 * (a(1, i) + a(1, j))
+              a(1, i) = a(1, i) - a(1, j)
+          end do
+      end if
+      end
+!
+      subroutine ddxt2d_sub(n1max, n1, n2, ics, isgn, a, t, 
+     &    ip, w)
+      integer n1max, n1, n2, ics, isgn, ip(0 : *), i, j
+      real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1), 
+     &    w(0 : *)
+      if (n1 .gt. 2) then
+          do i = 0, n1 - 4, 4
+              do j = 0, n2 - 1
+                  t(j) = a(i, j)
+                  t(n2 + j) = a(i + 1, j)
+                  t(2 * n2 + j) = a(i + 2, j)
+                  t(3 * n2 + j) = a(i + 3, j)
+              end do
+              if (ics .eq. 0) then
+                  call ddct(n2, isgn, t, ip, w)
+                  call ddct(n2, isgn, t(n2), ip, w)
+                  call ddct(n2, isgn, t(2 * n2), ip, w)
+                  call ddct(n2, isgn, t(3 * n2), ip, w)
+              else
+                  call ddst(n2, isgn, t, ip, w)
+                  call ddst(n2, isgn, t(n2), ip, w)
+                  call ddst(n2, isgn, t(2 * n2), ip, w)
+                  call ddst(n2, isgn, t(3 * n2), ip, w)
+              end if
+              do j = 0, n2 - 1
+                  a(i, j) = t(j)
+                  a(i + 1, j) = t(n2 + j)
+                  a(i + 2, j) = t(2 * n2 + j)
+                  a(i + 3, j) = t(3 * n2 + j)
+              end do
+          end do
+      else if (n1 .eq. 2) then
+          do j = 0, n2 - 1
+              t(j) = a(0, j)
+              t(n2 + j) = a(1, j)
+          end do
+          if (ics .eq. 0) then
+              call ddct(n2, isgn, t, ip, w)
+              call ddct(n2, isgn, t(n2), ip, w)
+          else
+              call ddst(n2, isgn, t, ip, w)
+              call ddst(n2, isgn, t(n2), ip, w)
+          end if
+          do j = 0, n2 - 1
+              a(0, j) = t(j)
+              a(1, j) = t(n2 + j)
+          end do
+      end if
+      end
+!
diff --git a/fftsg3d.c b/fftsg3d.c
new file mode 100644
index 0000000..62f7daa
--- /dev/null
+++ b/fftsg3d.c
@@ -0,0 +1,1695 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :three
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix, row-column
+    data        :inplace
+    table       :use
+functions
+    cdft3d: Complex Discrete Fourier Transform
+    rdft3d: Real Discrete Fourier Transform
+    ddct3d: Discrete Cosine Transform
+    ddst3d: Discrete Sine Transform
+function prototypes
+    void cdft3d(int, int, int, int, double ***, double *, int *, double *);
+    void rdft3d(int, int, int, int, double ***, double *, int *, double *);
+    void rdft3dsort(int, int, int, int, double ***);
+    void ddct3d(int, int, int, int, double ***, double *, int *, double *);
+    void ddst3d(int, int, int, int, double ***, double *, int *, double *);
+necessary package
+    fftsg.c  : 1D-FFT package
+macro definitions
+    USE_FFT3D_PTHREADS : default=not defined
+        FFT3D_MAX_THREADS     : must be 2^N, default=4
+        FFT3D_THREADS_BEGIN_N : default=65536
+    USE_FFT3D_WINTHREADS : default=not defined
+        FFT3D_MAX_THREADS     : must be 2^N, default=4
+        FFT3D_THREADS_BEGIN_N : default=131072
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                x[j1][j2][j3] * 
+                                exp(2*pi*i*j1*k1/n1) * 
+                                exp(2*pi*i*j2*k2/n2) * 
+                                exp(2*pi*i*j3*k3/n3), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        <case2>
+            X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                x[j1][j2][j3] * 
+                                exp(-2*pi*i*j1*k1/n1) * 
+                                exp(-2*pi*i*j2*k2/n2) * 
+                                exp(-2*pi*i*j3*k3/n3), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 1, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 1, n2 = power of 2
+        2*n3   :data length (int)
+                n3 >= 1, n3 = power of 2
+        a[0...n1-1][0...n2-1][0...2*n3-1]
+               :input/output data (double ***)
+                input data
+                    a[j1][j2][2*j3] = Re(x[j1][j2][j3]), 
+                    a[j1][j2][2*j3+1] = Im(x[j1][j2][j3]), 
+                    0<=j1<n1, 0<=j2<n2, 0<=j3<n3
+                output data
+                    a[k1][k2][2*k3] = Re(X[k1][k2][k3]), 
+                    a[k1][k2][2*k3+1] = Im(X[k1][k2][k3]), 
+                    0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        t[0...*]
+               :work area (double *)
+                length of t >= max(8*n1, 8*n2),        if single thread, 
+                length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS, 
+                                                       if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2, n3))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/2, n3/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
+        is 
+            cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    for (j3 = 0; j3 <= 2 * n3 - 1; j3++) {
+                        a[j1][j2][j3] *= 1.0 / n1 / n2 / n3;
+                    }
+                }
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                a[j1][j2][j3] * 
+                                cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+                                    2*pi*j3*k3/n3), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+            I[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                a[j1][j2][j3] * 
+                                sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+                                    2*pi*j3*k3/n3), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        <case2> IRDFT (excluding scale)
+            a[k1][k2][k3] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                (R[j1][j2][j3] * 
+                                cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+                                    2*pi*j3*k3/n3) + 
+                                I[j1][j2][j3] * 
+                                sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+                                    2*pi*j3*k3/n3)), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        (notes: R[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = R[k1][k2][k3], 
+                I[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = -I[k1][k2][k3], 
+                0<=k1<n1, 0<=k2<n2, 0<=k3<n3)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft3d(n1, n2, n3, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft3d(n1, n2, n3, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        n3     :data length (int)
+                n3 >= 2, n3 = power of 2
+        a[0...n1-1][0...n2-1][0...n3-1]
+               :input/output data (double ***)
+                <case1>
+                    output data
+                        a[k1][k2][2*k3] = R[k1][k2][k3]
+                                        = R[(n1-k1)%n1][(n2-k2)%n2][n3-k3], 
+                        a[k1][k2][2*k3+1] = I[k1][k2][k3]
+                                          = -I[(n1-k1)%n1][(n2-k2)%n2][n3-k3], 
+                            0<=k1<n1, 0<=k2<n2, 0<k3<n3/2, 
+                            (n%m : n mod m), 
+                        a[k1][k2][0] = R[k1][k2][0]
+                                     = R[(n1-k1)%n1][n2-k2][0], 
+                        a[k1][k2][1] = I[k1][k2][0]
+                                     = -I[(n1-k1)%n1][n2-k2][0], 
+                        a[k1][n2-k2][1] = R[(n1-k1)%n1][k2][n3/2]
+                                        = R[k1][n2-k2][n3/2], 
+                        a[k1][n2-k2][0] = -I[(n1-k1)%n1][k2][n3/2]
+                                        = I[k1][n2-k2][n3/2], 
+                            0<=k1<n1, 0<k2<n2/2, 
+                        a[k1][0][0] = R[k1][0][0]
+                                    = R[n1-k1][0][0], 
+                        a[k1][0][1] = I[k1][0][0]
+                                    = -I[n1-k1][0][0], 
+                        a[k1][n2/2][0] = R[k1][n2/2][0]
+                                       = R[n1-k1][n2/2][0], 
+                        a[k1][n2/2][1] = I[k1][n2/2][0]
+                                       = -I[n1-k1][n2/2][0], 
+                        a[n1-k1][0][1] = R[k1][0][n3/2]
+                                       = R[n1-k1][0][n3/2], 
+                        a[n1-k1][0][0] = -I[k1][0][n3/2]
+                                       = I[n1-k1][0][n3/2], 
+                        a[n1-k1][n2/2][1] = R[k1][n2/2][n3/2]
+                                          = R[n1-k1][n2/2][n3/2], 
+                        a[n1-k1][n2/2][0] = -I[k1][n2/2][n3/2]
+                                          = I[n1-k1][n2/2][n3/2], 
+                            0<k1<n1/2, 
+                        a[0][0][0] = R[0][0][0], 
+                        a[0][0][1] = R[0][0][n3/2], 
+                        a[0][n2/2][0] = R[0][n2/2][0], 
+                        a[0][n2/2][1] = R[0][n2/2][n3/2], 
+                        a[n1/2][0][0] = R[n1/2][0][0], 
+                        a[n1/2][0][1] = R[n1/2][0][n3/2], 
+                        a[n1/2][n2/2][0] = R[n1/2][n2/2][0], 
+                        a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
+                <case2>
+                    input data
+                        a[j1][j2][2*j3] = R[j1][j2][j3]
+                                        = R[(n1-j1)%n1][(n2-j2)%n2][n3-j3], 
+                        a[j1][j2][2*j3+1] = I[j1][j2][j3]
+                                          = -I[(n1-j1)%n1][(n2-j2)%n2][n3-j3], 
+                            0<=j1<n1, 0<=j2<n2, 0<j3<n3/2, 
+                        a[j1][j2][0] = R[j1][j2][0]
+                                     = R[(n1-j1)%n1][n2-j2][0], 
+                        a[j1][j2][1] = I[j1][j2][0]
+                                     = -I[(n1-j1)%n1][n2-j2][0], 
+                        a[j1][n2-j2][1] = R[(n1-j1)%n1][j2][n3/2]
+                                        = R[j1][n2-j2][n3/2], 
+                        a[j1][n2-j2][0] = -I[(n1-j1)%n1][j2][n3/2]
+                                        = I[j1][n2-j2][n3/2], 
+                            0<=j1<n1, 0<j2<n2/2, 
+                        a[j1][0][0] = R[j1][0][0]
+                                    = R[n1-j1][0][0], 
+                        a[j1][0][1] = I[j1][0][0]
+                                    = -I[n1-j1][0][0], 
+                        a[j1][n2/2][0] = R[j1][n2/2][0]
+                                       = R[n1-j1][n2/2][0], 
+                        a[j1][n2/2][1] = I[j1][n2/2][0]
+                                       = -I[n1-j1][n2/2][0], 
+                        a[n1-j1][0][1] = R[j1][0][n3/2]
+                                       = R[n1-j1][0][n3/2], 
+                        a[n1-j1][0][0] = -I[j1][0][n3/2]
+                                       = I[n1-j1][0][n3/2], 
+                        a[n1-j1][n2/2][1] = R[j1][n2/2][n3/2]
+                                          = R[n1-j1][n2/2][n3/2], 
+                        a[n1-j1][n2/2][0] = -I[j1][n2/2][n3/2]
+                                          = I[n1-j1][n2/2][n3/2], 
+                            0<j1<n1/2, 
+                        a[0][0][0] = R[0][0][0], 
+                        a[0][0][1] = R[0][0][n3/2], 
+                        a[0][n2/2][0] = R[0][n2/2][0], 
+                        a[0][n2/2][1] = R[0][n2/2][n3/2], 
+                        a[n1/2][0][0] = R[n1/2][0][0], 
+                        a[n1/2][0][1] = R[n1/2][0][n3/2], 
+                        a[n1/2][n2/2][0] = R[n1/2][n2/2][0], 
+                        a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
+                ---- output ordering ----
+                    rdft3d(n1, n2, n3, 1, a, t, ip, w);
+                    rdft3dsort(n1, n2, n3, 1, a);
+                    // stored data is a[0...n1-1][0...n2-1][0...n3+1]:
+                    // a[k1][k2][2*k3] = R[k1][k2][k3], 
+                    // a[k1][k2][2*k3+1] = I[k1][k2][k3], 
+                    // 0<=k1<n1, 0<=k2<n2, 0<=k3<=n3/2.
+                    // the stored data is larger than the input data!
+                ---- input ordering ----
+                    rdft3dsort(n1, n2, n3, -1, a);
+                    rdft3d(n1, n2, n3, -1, a, t, ip, w);
+        t[0...*]
+               :work area (double *)
+                length of t >= max(8*n1, 8*n2),        if single thread, 
+                length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS, 
+                                                       if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2, n3/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1/2, n2/2, n3/4) + n3/4
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            rdft3d(n1, n2, n3, 1, a, t, ip, w);
+        is 
+            rdft3d(n1, n2, n3, -1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    for (j3 = 0; j3 <= n3 - 1; j3++) {
+                        a[j1][j2][j3] *= 2.0 / n1 / n2 / n3;
+                    }
+                }
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                a[j1][j2][j3] * 
+                                cos(pi*j1*(k1+1/2)/n1) * 
+                                cos(pi*j2*(k2+1/2)/n2) * 
+                                cos(pi*j3*(k3+1/2)/n3), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        <case2> DCT
+            C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                a[j1][j2][j3] * 
+                                cos(pi*(j1+1/2)*k1/n1) * 
+                                cos(pi*(j2+1/2)*k2/n2) * 
+                                cos(pi*(j3+1/2)*k3/n3), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct3d(n1, n2, n3, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct3d(n1, n2, n3, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        n3     :data length (int)
+                n3 >= 2, n3 = power of 2
+        a[0...n1-1][0...n2-1][0...n3-1]
+               :input/output data (double ***)
+                output data
+                    a[k1][k2][k3] = C[k1][k2][k3], 
+                        0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        t[0...*]
+               :work area (double *)
+                length of t >= max(4*n1, 4*n2),        if single thread, 
+                length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS, 
+                                                       if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1/2, n2/2, n3/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddct3d(n1, n2, n3, -1, a, t, ip, w);
+        is 
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2][0] *= 0.5;
+                }
+                for (j3 = 0; j3 <= n3 - 1; j3++) {
+                    a[j1][0][j3] *= 0.5;
+                }
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                for (j3 = 0; j3 <= n3 - 1; j3++) {
+                    a[0][j2][j3] *= 0.5;
+                }
+            }
+            ddct3d(n1, n2, n3, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    for (j3 = 0; j3 <= n3 - 1; j3++) {
+                        a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
+                    }
+                }
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k1][k2][k3] = sum_j1=1^n1 sum_j2=1^n2 sum_j3=1^n3
+                                A[j1][j2][j3] * 
+                                sin(pi*j1*(k1+1/2)/n1) * 
+                                sin(pi*j2*(k2+1/2)/n2) * 
+                                sin(pi*j3*(k3+1/2)/n3), 
+                                0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+        <case2> DST
+            S[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+                                a[j1][j2][j3] * 
+                                sin(pi*(j1+1/2)*k1/n1) * 
+                                sin(pi*(j2+1/2)*k2/n2) * 
+                                sin(pi*(j3+1/2)*k3/n3), 
+                                0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst3d(n1, n2, n3, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst3d(n1, n2, n3, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        n3     :data length (int)
+                n3 >= 2, n3 = power of 2
+        a[0...n1-1][0...n2-1][0...n3-1]
+               :input/output data (double ***)
+                <case1>
+                    input data
+                        a[j1%n1][j2%n2][j3%n3] = A[j1][j2][j3], 
+                            0<j1<=n1, 0<j2<=n2, 0<j3<=n3, 
+                            (n%m : n mod m)
+                    output data
+                        a[k1][k2][k3] = S[k1][k2][k3], 
+                            0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+                <case2>
+                    output data
+                        a[k1%n1][k2%n2][k3%n3] = S[k1][k2][k3], 
+                            0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+        t[0...*]
+               :work area (double *)
+                length of t >= max(4*n1, 4*n2),        if single thread, 
+                length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS, 
+                                                       if multi threads, 
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1/2, n2/2, n3/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (double *)
+                length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddst3d(n1, n2, n3, -1, a, t, ip, w);
+        is 
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2][0] *= 0.5;
+                }
+                for (j3 = 0; j3 <= n3 - 1; j3++) {
+                    a[j1][0][j3] *= 0.5;
+                }
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                for (j3 = 0; j3 <= n3 - 1; j3++) {
+                    a[0][j2][j3] *= 0.5;
+                }
+            }
+            ddst3d(n1, n2, n3, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    for (j3 = 0; j3 <= n3 - 1; j3++) {
+                        a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
+                    }
+                }
+            }
+        .
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#define fft3d_alloc_error_check(p) { \
+    if ((p) == NULL) { \
+        fprintf(stderr, "fft3d memory allocation error\n"); \
+        exit(1); \
+    } \
+}
+
+
+#ifdef USE_FFT3D_PTHREADS
+#define USE_FFT3D_THREADS
+#ifndef FFT3D_MAX_THREADS
+#define FFT3D_MAX_THREADS 4
+#endif
+#ifndef FFT3D_THREADS_BEGIN_N
+#define FFT3D_THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#define fft3d_thread_t pthread_t
+#define fft3d_thread_create(thp,func,argp) { \
+    if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
+        fprintf(stderr, "fft3d thread error\n"); \
+        exit(1); \
+    } \
+}
+#define fft3d_thread_wait(th) { \
+    if (pthread_join(th, NULL) != 0) { \
+        fprintf(stderr, "fft3d thread error\n"); \
+        exit(1); \
+    } \
+}
+#endif /* USE_FFT3D_PTHREADS */
+
+
+#ifdef USE_FFT3D_WINTHREADS
+#define USE_FFT3D_THREADS
+#ifndef FFT3D_MAX_THREADS
+#define FFT3D_MAX_THREADS 4
+#endif
+#ifndef FFT3D_THREADS_BEGIN_N
+#define FFT3D_THREADS_BEGIN_N 131072
+#endif
+#include <windows.h>
+#define fft3d_thread_t HANDLE
+#define fft3d_thread_create(thp,func,argp) { \
+    DWORD thid; \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
+    if (*(thp) == 0) { \
+        fprintf(stderr, "fft3d thread error\n"); \
+        exit(1); \
+    } \
+}
+#define fft3d_thread_wait(th) { \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th); \
+}
+#endif /* USE_FFT3D_WINTHREADS */
+
+
+void cdft3d(int n1, int n2, int n3, int isgn, double ***a, 
+    double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a, 
+        double *t, int *ip, double *w);
+#ifdef USE_FFT3D_THREADS
+    void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a, 
+        double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+    int n, itnull, nt;
+    
+    n = n1;
+    if (n < n2) {
+        n = n2;
+    }
+    n <<= 1;
+    if (n < n3) {
+        n = n3;
+    }
+    if (n > (ip[0] << 2)) {
+        makewt(n >> 2, ip, w);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nt = n1;
+        if (nt < n2) {
+            nt = n2;
+        }
+        nt *= 8;
+#ifdef USE_FFT3D_THREADS
+        nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+        if (n3 == 4) {
+            nt >>= 1;
+        } else if (n3 < 4) {
+            nt >>= 2;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft3d_alloc_error_check(t);
+    }
+#ifdef USE_FFT3D_THREADS
+    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+        xdft3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
+        cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
+    } else 
+#endif /* USE_FFT3D_THREADS */
+    {
+        xdft3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
+        cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+void rdft3d(int n1, int n2, int n3, int isgn, double ***a, 
+    double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a, 
+        double *t, int *ip, double *w);
+    void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a);
+#ifdef USE_FFT3D_THREADS
+    void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a, 
+        double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+    int n, nw, nc, itnull, nt;
+    
+    n = n1;
+    if (n < n2) {
+        n = n2;
+    }
+    n <<= 1;
+    if (n < n3) {
+        n = n3;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n3 > (nc << 2)) {
+        nc = n3 >> 2;
+        makect(nc, ip, w + nw);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nt = n1;
+        if (nt < n2) {
+            nt = n2;
+        }
+        nt *= 8;
+#ifdef USE_FFT3D_THREADS
+        nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+        if (n3 == 4) {
+            nt >>= 1;
+        } else if (n3 < 4) {
+            nt >>= 2;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft3d_alloc_error_check(t);
+    }
+#ifdef USE_FFT3D_THREADS
+    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+        if (isgn < 0) {
+            rdft3d_sub(n1, n2, n3, isgn, a);
+            cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
+        }
+        xdft3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
+        if (isgn >= 0) {
+            cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
+            rdft3d_sub(n1, n2, n3, isgn, a);
+        }
+    } else 
+#endif /* USE_FFT3D_THREADS */
+    {
+        if (isgn < 0) {
+            rdft3d_sub(n1, n2, n3, isgn, a);
+            cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
+        }
+        xdft3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
+        if (isgn >= 0) {
+            cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
+            rdft3d_sub(n1, n2, n3, isgn, a);
+        }
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+void rdft3dsort(int n1, int n2, int n3, int isgn, double ***a)
+{
+    int n1h, n2h, i, j;
+    double x, y;
+    
+    n1h = n1 >> 1;
+    n2h = n2 >> 1;
+    if (isgn < 0) {
+        for (i = 0; i < n1; i++) {
+            for (j = n2h + 1; j < n2; j++) {
+                a[i][j][0] = a[i][j][n3 + 1];
+                a[i][j][1] = a[i][j][n3];
+            }
+        }
+        for (i = n1h + 1; i < n1; i++) {
+            a[i][0][0] = a[i][0][n3 + 1];
+            a[i][0][1] = a[i][0][n3];
+            a[i][n2h][0] = a[i][n2h][n3 + 1];
+            a[i][n2h][1] = a[i][n2h][n3];
+        }
+        a[0][0][1] = a[0][0][n3];
+        a[0][n2h][1] = a[0][n2h][n3];
+        a[n1h][0][1] = a[n1h][0][n3];
+        a[n1h][n2h][1] = a[n1h][n2h][n3];
+    } else {
+        for (j = n2h + 1; j < n2; j++) {
+            y = a[0][j][0];
+            x = a[0][j][1];
+            a[0][j][n3] = x;
+            a[0][j][n3 + 1] = y;
+            a[0][n2 - j][n3] = x;
+            a[0][n2 - j][n3 + 1] = -y;
+            a[0][j][0] = a[0][n2 - j][0];
+            a[0][j][1] = -a[0][n2 - j][1];
+        }
+        for (i = 1; i < n1; i++) {
+            for (j = n2h + 1; j < n2; j++) {
+                y = a[i][j][0];
+                x = a[i][j][1];
+                a[i][j][n3] = x;
+                a[i][j][n3 + 1] = y;
+                a[n1 - i][n2 - j][n3] = x;
+                a[n1 - i][n2 - j][n3 + 1] = -y;
+                a[i][j][0] = a[n1 - i][n2 - j][0];
+                a[i][j][1] = -a[n1 - i][n2 - j][1];
+            }
+        }
+        for (i = n1h + 1; i < n1; i++) {
+            y = a[i][0][0];
+            x = a[i][0][1];
+            a[i][0][n3] = x;
+            a[i][0][n3 + 1] = y;
+            a[n1 - i][0][n3] = x;
+            a[n1 - i][0][n3 + 1] = -y;
+            a[i][0][0] = a[n1 - i][0][0];
+            a[i][0][1] = -a[n1 - i][0][1];
+            y = a[i][n2h][0];
+            x = a[i][n2h][1];
+            a[i][n2h][n3] = x;
+            a[i][n2h][n3 + 1] = y;
+            a[n1 - i][n2h][n3] = x;
+            a[n1 - i][n2h][n3 + 1] = -y;
+            a[i][n2h][0] = a[n1 - i][n2h][0];
+            a[i][n2h][1] = -a[n1 - i][n2h][1];
+        }
+        a[0][0][n3] = a[0][0][1];
+        a[0][0][n3 + 1] = 0;
+        a[0][0][1] = 0;
+        a[0][n2h][n3] = a[0][n2h][1];
+        a[0][n2h][n3 + 1] = 0;
+        a[0][n2h][1] = 0;
+        a[n1h][0][n3] = a[n1h][0][1];
+        a[n1h][0][n3 + 1] = 0;
+        a[n1h][0][1] = 0;
+        a[n1h][n2h][n3] = a[n1h][n2h][1];
+        a[n1h][n2h][n3 + 1] = 0;
+        a[n1h][n2h][1] = 0;
+    }
+}
+
+
+void ddct3d(int n1, int n2, int n3, int isgn, double ***a, 
+    double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+#ifdef USE_FFT3D_THREADS
+    void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+    int n, nw, nc, itnull, nt;
+    
+    n = n1;
+    if (n < n2) {
+        n = n2;
+    }
+    if (n < n3) {
+        n = n3;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nt = n1;
+        if (nt < n2) {
+            nt = n2;
+        }
+        nt *= 4;
+#ifdef USE_FFT3D_THREADS
+        nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+        if (n3 == 2) {
+            nt >>= 1;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft3d_alloc_error_check(t);
+    }
+#ifdef USE_FFT3D_THREADS
+    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+        ddxt3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
+        ddxt3db_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
+    } else 
+#endif /* USE_FFT3D_THREADS */
+    {
+        ddxt3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
+        ddxt3db_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+void ddst3d(int n1, int n2, int n3, int isgn, double ***a, 
+    double *t, int *ip, double *w)
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+#ifdef USE_FFT3D_THREADS
+    void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+    void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn, 
+        double ***a, double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+    int n, nw, nc, itnull, nt;
+    
+    n = n1;
+    if (n < n2) {
+        n = n2;
+    }
+    if (n < n3) {
+        n = n3;
+    }
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    itnull = 0;
+    if (t == NULL) {
+        itnull = 1;
+        nt = n1;
+        if (nt < n2) {
+            nt = n2;
+        }
+        nt *= 4;
+#ifdef USE_FFT3D_THREADS
+        nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+        if (n3 == 2) {
+            nt >>= 1;
+        }
+        t = (double *) malloc(sizeof(double) * nt);
+        fft3d_alloc_error_check(t);
+    }
+#ifdef USE_FFT3D_THREADS
+    if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+        ddxt3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
+        ddxt3db_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
+    } else 
+#endif /* USE_FFT3D_THREADS */
+    {
+        ddxt3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
+        ddxt3db_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
+    }
+    if (itnull != 0) {
+        free(t);
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn, 
+    double ***a, double *t, int *ip, double *w)
+{
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    int i, j, k;
+    
+    for (i = 0; i < n1; i++) {
+        if (icr == 0) {
+            for (j = 0; j < n2; j++) {
+                cdft(n3, isgn, a[i][j], ip, w);
+            }
+        } else if (isgn >= 0) {
+            for (j = 0; j < n2; j++) {
+                rdft(n3, isgn, a[i][j], ip, w);
+            }
+        }
+        if (n3 > 4) {
+            for (k = 0; k < n3; k += 8) {
+                for (j = 0; j < n2; j++) {
+                    t[2 * j] = a[i][j][k];
+                    t[2 * j + 1] = a[i][j][k + 1];
+                    t[2 * n2 + 2 * j] = a[i][j][k + 2];
+                    t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
+                    t[4 * n2 + 2 * j] = a[i][j][k + 4];
+                    t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
+                    t[6 * n2 + 2 * j] = a[i][j][k + 6];
+                    t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
+                }
+                cdft(2 * n2, isgn, t, ip, w);
+                cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+                cdft(2 * n2, isgn, &t[4 * n2], ip, w);
+                cdft(2 * n2, isgn, &t[6 * n2], ip, w);
+                for (j = 0; j < n2; j++) {
+                    a[i][j][k] = t[2 * j];
+                    a[i][j][k + 1] = t[2 * j + 1];
+                    a[i][j][k + 2] = t[2 * n2 + 2 * j];
+                    a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
+                    a[i][j][k + 4] = t[4 * n2 + 2 * j];
+                    a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
+                    a[i][j][k + 6] = t[6 * n2 + 2 * j];
+                    a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
+                }
+            }
+        } else if (n3 == 4) {
+            for (j = 0; j < n2; j++) {
+                t[2 * j] = a[i][j][0];
+                t[2 * j + 1] = a[i][j][1];
+                t[2 * n2 + 2 * j] = a[i][j][2];
+                t[2 * n2 + 2 * j + 1] = a[i][j][3];
+            }
+            cdft(2 * n2, isgn, t, ip, w);
+            cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+            for (j = 0; j < n2; j++) {
+                a[i][j][0] = t[2 * j];
+                a[i][j][1] = t[2 * j + 1];
+                a[i][j][2] = t[2 * n2 + 2 * j];
+                a[i][j][3] = t[2 * n2 + 2 * j + 1];
+            }
+        } else if (n3 == 2) {
+            for (j = 0; j < n2; j++) {
+                t[2 * j] = a[i][j][0];
+                t[2 * j + 1] = a[i][j][1];
+            }
+            cdft(2 * n2, isgn, t, ip, w);
+            for (j = 0; j < n2; j++) {
+                a[i][j][0] = t[2 * j];
+                a[i][j][1] = t[2 * j + 1];
+            }
+        }
+        if (icr != 0 && isgn < 0) {
+            for (j = 0; j < n2; j++) {
+                rdft(n3, isgn, a[i][j], ip, w);
+            }
+        }
+    }
+}
+
+
+void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a, 
+    double *t, int *ip, double *w)
+{
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    int i, j, k;
+    
+    if (n3 > 4) {
+        for (j = 0; j < n2; j++) {
+            for (k = 0; k < n3; k += 8) {
+                for (i = 0; i < n1; i++) {
+                    t[2 * i] = a[i][j][k];
+                    t[2 * i + 1] = a[i][j][k + 1];
+                    t[2 * n1 + 2 * i] = a[i][j][k + 2];
+                    t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
+                    t[4 * n1 + 2 * i] = a[i][j][k + 4];
+                    t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
+                    t[6 * n1 + 2 * i] = a[i][j][k + 6];
+                    t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
+                }
+                cdft(2 * n1, isgn, t, ip, w);
+                cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+                cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+                cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+                for (i = 0; i < n1; i++) {
+                    a[i][j][k] = t[2 * i];
+                    a[i][j][k + 1] = t[2 * i + 1];
+                    a[i][j][k + 2] = t[2 * n1 + 2 * i];
+                    a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
+                    a[i][j][k + 4] = t[4 * n1 + 2 * i];
+                    a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
+                    a[i][j][k + 6] = t[6 * n1 + 2 * i];
+                    a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
+                }
+            }
+        }
+    } else if (n3 == 4) {
+        for (j = 0; j < n2; j++) {
+            for (i = 0; i < n1; i++) {
+                t[2 * i] = a[i][j][0];
+                t[2 * i + 1] = a[i][j][1];
+                t[2 * n1 + 2 * i] = a[i][j][2];
+                t[2 * n1 + 2 * i + 1] = a[i][j][3];
+            }
+            cdft(2 * n1, isgn, t, ip, w);
+            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+            for (i = 0; i < n1; i++) {
+                a[i][j][0] = t[2 * i];
+                a[i][j][1] = t[2 * i + 1];
+                a[i][j][2] = t[2 * n1 + 2 * i];
+                a[i][j][3] = t[2 * n1 + 2 * i + 1];
+            }
+        }
+    } else if (n3 == 2) {
+        for (j = 0; j < n2; j++) {
+            for (i = 0; i < n1; i++) {
+                t[2 * i] = a[i][j][0];
+                t[2 * i + 1] = a[i][j][1];
+            }
+            cdft(2 * n1, isgn, t, ip, w);
+            for (i = 0; i < n1; i++) {
+                a[i][j][0] = t[2 * i];
+                a[i][j][1] = t[2 * i + 1];
+            }
+        }
+    }
+}
+
+
+void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a)
+{
+    int n1h, n2h, i, j, k, l;
+    double xi;
+    
+    n1h = n1 >> 1;
+    n2h = n2 >> 1;
+    if (isgn < 0) {
+        for (i = 1; i < n1h; i++) {
+            j = n1 - i;
+            xi = a[i][0][0] - a[j][0][0];
+            a[i][0][0] += a[j][0][0];
+            a[j][0][0] = xi;
+            xi = a[j][0][1] - a[i][0][1];
+            a[i][0][1] += a[j][0][1];
+            a[j][0][1] = xi;
+            xi = a[i][n2h][0] - a[j][n2h][0];
+            a[i][n2h][0] += a[j][n2h][0];
+            a[j][n2h][0] = xi;
+            xi = a[j][n2h][1] - a[i][n2h][1];
+            a[i][n2h][1] += a[j][n2h][1];
+            a[j][n2h][1] = xi;
+            for (k = 1; k < n2h; k++) {
+                l = n2 - k;
+                xi = a[i][k][0] - a[j][l][0];
+                a[i][k][0] += a[j][l][0];
+                a[j][l][0] = xi;
+                xi = a[j][l][1] - a[i][k][1];
+                a[i][k][1] += a[j][l][1];
+                a[j][l][1] = xi;
+                xi = a[j][k][0] - a[i][l][0];
+                a[j][k][0] += a[i][l][0];
+                a[i][l][0] = xi;
+                xi = a[i][l][1] - a[j][k][1];
+                a[j][k][1] += a[i][l][1];
+                a[i][l][1] = xi;
+            }
+        }
+        for (k = 1; k < n2h; k++) {
+            l = n2 - k;
+            xi = a[0][k][0] - a[0][l][0];
+            a[0][k][0] += a[0][l][0];
+            a[0][l][0] = xi;
+            xi = a[0][l][1] - a[0][k][1];
+            a[0][k][1] += a[0][l][1];
+            a[0][l][1] = xi;
+            xi = a[n1h][k][0] - a[n1h][l][0];
+            a[n1h][k][0] += a[n1h][l][0];
+            a[n1h][l][0] = xi;
+            xi = a[n1h][l][1] - a[n1h][k][1];
+            a[n1h][k][1] += a[n1h][l][1];
+            a[n1h][l][1] = xi;
+        }
+    } else {
+        for (i = 1; i < n1h; i++) {
+            j = n1 - i;
+            a[j][0][0] = 0.5 * (a[i][0][0] - a[j][0][0]);
+            a[i][0][0] -= a[j][0][0];
+            a[j][0][1] = 0.5 * (a[i][0][1] + a[j][0][1]);
+            a[i][0][1] -= a[j][0][1];
+            a[j][n2h][0] = 0.5 * (a[i][n2h][0] - a[j][n2h][0]);
+            a[i][n2h][0] -= a[j][n2h][0];
+            a[j][n2h][1] = 0.5 * (a[i][n2h][1] + a[j][n2h][1]);
+            a[i][n2h][1] -= a[j][n2h][1];
+            for (k = 1; k < n2h; k++) {
+                l = n2 - k;
+                a[j][l][0] = 0.5 * (a[i][k][0] - a[j][l][0]);
+                a[i][k][0] -= a[j][l][0];
+                a[j][l][1] = 0.5 * (a[i][k][1] + a[j][l][1]);
+                a[i][k][1] -= a[j][l][1];
+                a[i][l][0] = 0.5 * (a[j][k][0] - a[i][l][0]);
+                a[j][k][0] -= a[i][l][0];
+                a[i][l][1] = 0.5 * (a[j][k][1] + a[i][l][1]);
+                a[j][k][1] -= a[i][l][1];
+            }
+        }
+        for (k = 1; k < n2h; k++) {
+            l = n2 - k;
+            a[0][l][0] = 0.5 * (a[0][k][0] - a[0][l][0]);
+            a[0][k][0] -= a[0][l][0];
+            a[0][l][1] = 0.5 * (a[0][k][1] + a[0][l][1]);
+            a[0][k][1] -= a[0][l][1];
+            a[n1h][l][0] = 0.5 * (a[n1h][k][0] - a[n1h][l][0]);
+            a[n1h][k][0] -= a[n1h][l][0];
+            a[n1h][l][1] = 0.5 * (a[n1h][k][1] + a[n1h][l][1]);
+            a[n1h][k][1] -= a[n1h][l][1];
+        }
+    }
+}
+
+
+void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn, 
+    double ***a, double *t, int *ip, double *w)
+{
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    int i, j, k;
+    
+    for (i = 0; i < n1; i++) {
+        if (ics == 0) {
+            for (j = 0; j < n2; j++) {
+                ddct(n3, isgn, a[i][j], ip, w);
+            }
+        } else {
+            for (j = 0; j < n2; j++) {
+                ddst(n3, isgn, a[i][j], ip, w);
+            }
+        }
+        if (n3 > 2) {
+            for (k = 0; k < n3; k += 4) {
+                for (j = 0; j < n2; j++) {
+                    t[j] = a[i][j][k];
+                    t[n2 + j] = a[i][j][k + 1];
+                    t[2 * n2 + j] = a[i][j][k + 2];
+                    t[3 * n2 + j] = a[i][j][k + 3];
+                }
+                if (ics == 0) {
+                    ddct(n2, isgn, t, ip, w);
+                    ddct(n2, isgn, &t[n2], ip, w);
+                    ddct(n2, isgn, &t[2 * n2], ip, w);
+                    ddct(n2, isgn, &t[3 * n2], ip, w);
+                } else {
+                    ddst(n2, isgn, t, ip, w);
+                    ddst(n2, isgn, &t[n2], ip, w);
+                    ddst(n2, isgn, &t[2 * n2], ip, w);
+                    ddst(n2, isgn, &t[3 * n2], ip, w);
+                }
+                for (j = 0; j < n2; j++) {
+                    a[i][j][k] = t[j];
+                    a[i][j][k + 1] = t[n2 + j];
+                    a[i][j][k + 2] = t[2 * n2 + j];
+                    a[i][j][k + 3] = t[3 * n2 + j];
+                }
+            }
+        } else if (n3 == 2) {
+            for (j = 0; j < n2; j++) {
+                t[j] = a[i][j][0];
+                t[n2 + j] = a[i][j][1];
+            }
+            if (ics == 0) {
+                ddct(n2, isgn, t, ip, w);
+                ddct(n2, isgn, &t[n2], ip, w);
+            } else {
+                ddst(n2, isgn, t, ip, w);
+                ddst(n2, isgn, &t[n2], ip, w);
+            }
+            for (j = 0; j < n2; j++) {
+                a[i][j][0] = t[j];
+                a[i][j][1] = t[n2 + j];
+            }
+        }
+    }
+}
+
+
+void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn, 
+    double ***a, double *t, int *ip, double *w)
+{
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    int i, j, k;
+    
+    if (n3 > 2) {
+        for (j = 0; j < n2; j++) {
+            for (k = 0; k < n3; k += 4) {
+                for (i = 0; i < n1; i++) {
+                    t[i] = a[i][j][k];
+                    t[n1 + i] = a[i][j][k + 1];
+                    t[2 * n1 + i] = a[i][j][k + 2];
+                    t[3 * n1 + i] = a[i][j][k + 3];
+                }
+                if (ics == 0) {
+                    ddct(n1, isgn, t, ip, w);
+                    ddct(n1, isgn, &t[n1], ip, w);
+                    ddct(n1, isgn, &t[2 * n1], ip, w);
+                    ddct(n1, isgn, &t[3 * n1], ip, w);
+                } else {
+                    ddst(n1, isgn, t, ip, w);
+                    ddst(n1, isgn, &t[n1], ip, w);
+                    ddst(n1, isgn, &t[2 * n1], ip, w);
+                    ddst(n1, isgn, &t[3 * n1], ip, w);
+                }
+                for (i = 0; i < n1; i++) {
+                    a[i][j][k] = t[i];
+                    a[i][j][k + 1] = t[n1 + i];
+                    a[i][j][k + 2] = t[2 * n1 + i];
+                    a[i][j][k + 3] = t[3 * n1 + i];
+                }
+            }
+        }
+    } else if (n3 == 2) {
+        for (j = 0; j < n2; j++) {
+            for (i = 0; i < n1; i++) {
+                t[i] = a[i][j][0];
+                t[n1 + i] = a[i][j][1];
+            }
+            if (ics == 0) {
+                ddct(n1, isgn, t, ip, w);
+                ddct(n1, isgn, &t[n1], ip, w);
+            } else {
+                ddst(n1, isgn, t, ip, w);
+                ddst(n1, isgn, &t[n1], ip, w);
+            }
+            for (i = 0; i < n1; i++) {
+                a[i][j][0] = t[i];
+                a[i][j][1] = t[n1 + i];
+            }
+        }
+    }
+}
+
+
+#ifdef USE_FFT3D_THREADS
+struct fft3d_arg_st {
+    int nthread;
+    int n0;
+    int n1;
+    int n2;
+    int n3;
+    int ic;
+    int isgn;
+    double ***a;
+    double *t;
+    int *ip;
+    double *w;
+};
+typedef struct fft3d_arg_st fft3d_arg_t;
+
+
+void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn, 
+    double ***a, double *t, int *ip, double *w)
+{
+    void *xdft3da_th(void *p);
+    fft3d_thread_t th[FFT3D_MAX_THREADS];
+    fft3d_arg_t ag[FFT3D_MAX_THREADS];
+    int nthread, nt, i;
+    
+    nthread = FFT3D_MAX_THREADS;
+    if (nthread > n1) {
+        nthread = n1;
+    }
+    nt = 8 * n2;
+    if (n3 == 4) {
+        nt >>= 1;
+    } else if (n3 < 4) {
+        nt >>= 2;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].n3 = n3;
+        ag[i].ic = icr;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].t = &t[nt * i];
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft3d_thread_create(&th[i], xdft3da_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft3d_thread_wait(th[i]);
+    }
+}
+
+
+void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a, 
+    double *t, int *ip, double *w)
+{
+    void *cdft3db_th(void *p);
+    fft3d_thread_t th[FFT3D_MAX_THREADS];
+    fft3d_arg_t ag[FFT3D_MAX_THREADS];
+    int nthread, nt, i;
+    
+    nthread = FFT3D_MAX_THREADS;
+    if (nthread > n2) {
+        nthread = n2;
+    }
+    nt = 8 * n1;
+    if (n3 == 4) {
+        nt >>= 1;
+    } else if (n3 < 4) {
+        nt >>= 2;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].n3 = n3;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].t = &t[nt * i];
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft3d_thread_create(&th[i], cdft3db_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft3d_thread_wait(th[i]);
+    }
+}
+
+
+void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn, 
+    double ***a, double *t, int *ip, double *w)
+{
+    void *ddxt3da_th(void *p);
+    fft3d_thread_t th[FFT3D_MAX_THREADS];
+    fft3d_arg_t ag[FFT3D_MAX_THREADS];
+    int nthread, nt, i;
+    
+    nthread = FFT3D_MAX_THREADS;
+    if (nthread > n1) {
+        nthread = n1;
+    }
+    nt = 4 * n2;
+    if (n3 == 2) {
+        nt >>= 1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].n3 = n3;
+        ag[i].ic = ics;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].t = &t[nt * i];
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft3d_thread_create(&th[i], ddxt3da_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft3d_thread_wait(th[i]);
+    }
+}
+
+
+void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn, 
+    double ***a, double *t, int *ip, double *w)
+{
+    void *ddxt3db_th(void *p);
+    fft3d_thread_t th[FFT3D_MAX_THREADS];
+    fft3d_arg_t ag[FFT3D_MAX_THREADS];
+    int nthread, nt, i;
+    
+    nthread = FFT3D_MAX_THREADS;
+    if (nthread > n2) {
+        nthread = n2;
+    }
+    nt = 4 * n1;
+    if (n3 == 2) {
+        nt >>= 1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].nthread = nthread;
+        ag[i].n0 = i;
+        ag[i].n1 = n1;
+        ag[i].n2 = n2;
+        ag[i].n3 = n3;
+        ag[i].ic = ics;
+        ag[i].isgn = isgn;
+        ag[i].a = a;
+        ag[i].t = &t[nt * i];
+        ag[i].ip = ip;
+        ag[i].w = w;
+        fft3d_thread_create(&th[i], ddxt3db_th, &ag[i]);
+    }
+    for (i = 0; i < nthread; i++) {
+        fft3d_thread_wait(th[i]);
+    }
+}
+
+
+void *xdft3da_th(void *p)
+{
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, n3, icr, isgn, *ip, i, j, k;
+    double ***a, *t, *w;
+    
+    nthread = ((fft3d_arg_t *) p)->nthread;
+    n0 = ((fft3d_arg_t *) p)->n0;
+    n1 = ((fft3d_arg_t *) p)->n1;
+    n2 = ((fft3d_arg_t *) p)->n2;
+    n3 = ((fft3d_arg_t *) p)->n3;
+    icr = ((fft3d_arg_t *) p)->ic;
+    isgn = ((fft3d_arg_t *) p)->isgn;
+    a = ((fft3d_arg_t *) p)->a;
+    t = ((fft3d_arg_t *) p)->t;
+    ip = ((fft3d_arg_t *) p)->ip;
+    w = ((fft3d_arg_t *) p)->w;
+    for (i = n0; i < n1; i += nthread) {
+        if (icr == 0) {
+            for (j = 0; j < n2; j++) {
+                cdft(n3, isgn, a[i][j], ip, w);
+            }
+        } else if (isgn >= 0) {
+            for (j = 0; j < n2; j++) {
+                rdft(n3, isgn, a[i][j], ip, w);
+            }
+        }
+        if (n3 > 4) {
+            for (k = 0; k < n3; k += 8) {
+                for (j = 0; j < n2; j++) {
+                    t[2 * j] = a[i][j][k];
+                    t[2 * j + 1] = a[i][j][k + 1];
+                    t[2 * n2 + 2 * j] = a[i][j][k + 2];
+                    t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
+                    t[4 * n2 + 2 * j] = a[i][j][k + 4];
+                    t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
+                    t[6 * n2 + 2 * j] = a[i][j][k + 6];
+                    t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
+                }
+                cdft(2 * n2, isgn, t, ip, w);
+                cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+                cdft(2 * n2, isgn, &t[4 * n2], ip, w);
+                cdft(2 * n2, isgn, &t[6 * n2], ip, w);
+                for (j = 0; j < n2; j++) {
+                    a[i][j][k] = t[2 * j];
+                    a[i][j][k + 1] = t[2 * j + 1];
+                    a[i][j][k + 2] = t[2 * n2 + 2 * j];
+                    a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
+                    a[i][j][k + 4] = t[4 * n2 + 2 * j];
+                    a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
+                    a[i][j][k + 6] = t[6 * n2 + 2 * j];
+                    a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
+                }
+            }
+        } else if (n3 == 4) {
+            for (j = 0; j < n2; j++) {
+                t[2 * j] = a[i][j][0];
+                t[2 * j + 1] = a[i][j][1];
+                t[2 * n2 + 2 * j] = a[i][j][2];
+                t[2 * n2 + 2 * j + 1] = a[i][j][3];
+            }
+            cdft(2 * n2, isgn, t, ip, w);
+            cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+            for (j = 0; j < n2; j++) {
+                a[i][j][0] = t[2 * j];
+                a[i][j][1] = t[2 * j + 1];
+                a[i][j][2] = t[2 * n2 + 2 * j];
+                a[i][j][3] = t[2 * n2 + 2 * j + 1];
+            }
+        } else if (n3 == 2) {
+            for (j = 0; j < n2; j++) {
+                t[2 * j] = a[i][j][0];
+                t[2 * j + 1] = a[i][j][1];
+            }
+            cdft(2 * n2, isgn, t, ip, w);
+            for (j = 0; j < n2; j++) {
+                a[i][j][0] = t[2 * j];
+                a[i][j][1] = t[2 * j + 1];
+            }
+        }
+        if (icr != 0 && isgn < 0) {
+            for (j = 0; j < n2; j++) {
+                rdft(n3, isgn, a[i][j], ip, w);
+            }
+        }
+    }
+    return (void *) 0;
+}
+
+
+void *cdft3db_th(void *p)
+{
+    void cdft(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, n3, isgn, *ip, i, j, k;
+    double ***a, *t, *w;
+    
+    nthread = ((fft3d_arg_t *) p)->nthread;
+    n0 = ((fft3d_arg_t *) p)->n0;
+    n1 = ((fft3d_arg_t *) p)->n1;
+    n2 = ((fft3d_arg_t *) p)->n2;
+    n3 = ((fft3d_arg_t *) p)->n3;
+    isgn = ((fft3d_arg_t *) p)->isgn;
+    a = ((fft3d_arg_t *) p)->a;
+    t = ((fft3d_arg_t *) p)->t;
+    ip = ((fft3d_arg_t *) p)->ip;
+    w = ((fft3d_arg_t *) p)->w;
+    if (n3 > 4) {
+        for (j = n0; j < n2; j += nthread) {
+            for (k = 0; k < n3; k += 8) {
+                for (i = 0; i < n1; i++) {
+                    t[2 * i] = a[i][j][k];
+                    t[2 * i + 1] = a[i][j][k + 1];
+                    t[2 * n1 + 2 * i] = a[i][j][k + 2];
+                    t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
+                    t[4 * n1 + 2 * i] = a[i][j][k + 4];
+                    t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
+                    t[6 * n1 + 2 * i] = a[i][j][k + 6];
+                    t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
+                }
+                cdft(2 * n1, isgn, t, ip, w);
+                cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+                cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+                cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+                for (i = 0; i < n1; i++) {
+                    a[i][j][k] = t[2 * i];
+                    a[i][j][k + 1] = t[2 * i + 1];
+                    a[i][j][k + 2] = t[2 * n1 + 2 * i];
+                    a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
+                    a[i][j][k + 4] = t[4 * n1 + 2 * i];
+                    a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
+                    a[i][j][k + 6] = t[6 * n1 + 2 * i];
+                    a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
+                }
+            }
+        }
+    } else if (n3 == 4) {
+        for (j = n0; j < n2; j += nthread) {
+            for (i = 0; i < n1; i++) {
+                t[2 * i] = a[i][j][0];
+                t[2 * i + 1] = a[i][j][1];
+                t[2 * n1 + 2 * i] = a[i][j][2];
+                t[2 * n1 + 2 * i + 1] = a[i][j][3];
+            }
+            cdft(2 * n1, isgn, t, ip, w);
+            cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+            for (i = 0; i < n1; i++) {
+                a[i][j][0] = t[2 * i];
+                a[i][j][1] = t[2 * i + 1];
+                a[i][j][2] = t[2 * n1 + 2 * i];
+                a[i][j][3] = t[2 * n1 + 2 * i + 1];
+            }
+        }
+    } else if (n3 == 2) {
+        for (j = n0; j < n2; j += nthread) {
+            for (i = 0; i < n1; i++) {
+                t[2 * i] = a[i][j][0];
+                t[2 * i + 1] = a[i][j][1];
+            }
+            cdft(2 * n1, isgn, t, ip, w);
+            for (i = 0; i < n1; i++) {
+                a[i][j][0] = t[2 * i];
+                a[i][j][1] = t[2 * i + 1];
+            }
+        }
+    }
+    return (void *) 0;
+}
+
+
+void *ddxt3da_th(void *p)
+{
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
+    double ***a, *t, *w;
+    
+    nthread = ((fft3d_arg_t *) p)->nthread;
+    n0 = ((fft3d_arg_t *) p)->n0;
+    n1 = ((fft3d_arg_t *) p)->n1;
+    n2 = ((fft3d_arg_t *) p)->n2;
+    n3 = ((fft3d_arg_t *) p)->n3;
+    ics = ((fft3d_arg_t *) p)->ic;
+    isgn = ((fft3d_arg_t *) p)->isgn;
+    a = ((fft3d_arg_t *) p)->a;
+    t = ((fft3d_arg_t *) p)->t;
+    ip = ((fft3d_arg_t *) p)->ip;
+    w = ((fft3d_arg_t *) p)->w;
+    for (i = n0; i < n1; i += nthread) {
+        if (ics == 0) {
+            for (j = 0; j < n2; j++) {
+                ddct(n3, isgn, a[i][j], ip, w);
+            }
+        } else {
+            for (j = 0; j < n2; j++) {
+                ddst(n3, isgn, a[i][j], ip, w);
+            }
+        }
+        if (n3 > 2) {
+            for (k = 0; k < n3; k += 4) {
+                for (j = 0; j < n2; j++) {
+                    t[j] = a[i][j][k];
+                    t[n2 + j] = a[i][j][k + 1];
+                    t[2 * n2 + j] = a[i][j][k + 2];
+                    t[3 * n2 + j] = a[i][j][k + 3];
+                }
+                if (ics == 0) {
+                    ddct(n2, isgn, t, ip, w);
+                    ddct(n2, isgn, &t[n2], ip, w);
+                    ddct(n2, isgn, &t[2 * n2], ip, w);
+                    ddct(n2, isgn, &t[3 * n2], ip, w);
+                } else {
+                    ddst(n2, isgn, t, ip, w);
+                    ddst(n2, isgn, &t[n2], ip, w);
+                    ddst(n2, isgn, &t[2 * n2], ip, w);
+                    ddst(n2, isgn, &t[3 * n2], ip, w);
+                }
+                for (j = 0; j < n2; j++) {
+                    a[i][j][k] = t[j];
+                    a[i][j][k + 1] = t[n2 + j];
+                    a[i][j][k + 2] = t[2 * n2 + j];
+                    a[i][j][k + 3] = t[3 * n2 + j];
+                }
+            }
+        } else if (n3 == 2) {
+            for (j = 0; j < n2; j++) {
+                t[j] = a[i][j][0];
+                t[n2 + j] = a[i][j][1];
+            }
+            if (ics == 0) {
+                ddct(n2, isgn, t, ip, w);
+                ddct(n2, isgn, &t[n2], ip, w);
+            } else {
+                ddst(n2, isgn, t, ip, w);
+                ddst(n2, isgn, &t[n2], ip, w);
+            }
+            for (j = 0; j < n2; j++) {
+                a[i][j][0] = t[j];
+                a[i][j][1] = t[n2 + j];
+            }
+        }
+    }
+    return (void *) 0;
+}
+
+
+void *ddxt3db_th(void *p)
+{
+    void ddct(int n, int isgn, double *a, int *ip, double *w);
+    void ddst(int n, int isgn, double *a, int *ip, double *w);
+    int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
+    double ***a, *t, *w;
+    
+    nthread = ((fft3d_arg_t *) p)->nthread;
+    n0 = ((fft3d_arg_t *) p)->n0;
+    n1 = ((fft3d_arg_t *) p)->n1;
+    n2 = ((fft3d_arg_t *) p)->n2;
+    n3 = ((fft3d_arg_t *) p)->n3;
+    ics = ((fft3d_arg_t *) p)->ic;
+    isgn = ((fft3d_arg_t *) p)->isgn;
+    a = ((fft3d_arg_t *) p)->a;
+    t = ((fft3d_arg_t *) p)->t;
+    ip = ((fft3d_arg_t *) p)->ip;
+    w = ((fft3d_arg_t *) p)->w;
+    if (n3 > 2) {
+        for (j = n0; j < n2; j += nthread) {
+            for (k = 0; k < n3; k += 4) {
+                for (i = 0; i < n1; i++) {
+                    t[i] = a[i][j][k];
+                    t[n1 + i] = a[i][j][k + 1];
+                    t[2 * n1 + i] = a[i][j][k + 2];
+                    t[3 * n1 + i] = a[i][j][k + 3];
+                }
+                if (ics == 0) {
+                    ddct(n1, isgn, t, ip, w);
+                    ddct(n1, isgn, &t[n1], ip, w);
+                    ddct(n1, isgn, &t[2 * n1], ip, w);
+                    ddct(n1, isgn, &t[3 * n1], ip, w);
+                } else {
+                    ddst(n1, isgn, t, ip, w);
+                    ddst(n1, isgn, &t[n1], ip, w);
+                    ddst(n1, isgn, &t[2 * n1], ip, w);
+                    ddst(n1, isgn, &t[3 * n1], ip, w);
+                }
+                for (i = 0; i < n1; i++) {
+                    a[i][j][k] = t[i];
+                    a[i][j][k + 1] = t[n1 + i];
+                    a[i][j][k + 2] = t[2 * n1 + i];
+                    a[i][j][k + 3] = t[3 * n1 + i];
+                }
+            }
+        }
+    } else if (n3 == 2) {
+        for (j = n0; j < n2; j += nthread) {
+            for (i = 0; i < n1; i++) {
+                t[i] = a[i][j][0];
+                t[n1 + i] = a[i][j][1];
+            }
+            if (ics == 0) {
+                ddct(n1, isgn, t, ip, w);
+                ddct(n1, isgn, &t[n1], ip, w);
+            } else {
+                ddst(n1, isgn, t, ip, w);
+                ddst(n1, isgn, &t[n1], ip, w);
+            }
+            for (i = 0; i < n1; i++) {
+                a[i][j][0] = t[i];
+                a[i][j][1] = t[n1 + i];
+            }
+        }
+    }
+    return (void *) 0;
+}
+#endif /* USE_FFT3D_THREADS */
+
diff --git a/fftsg3d.f b/fftsg3d.f
new file mode 100644
index 0000000..f08b1dc
--- /dev/null
+++ b/fftsg3d.f
@@ -0,0 +1,926 @@
+! Fast Fourier/Cosine/Sine Transform
+!     dimension   :three
+!     data length :power of 2
+!     decimation  :frequency
+!     radix       :split-radix, row-column
+!     data        :inplace
+!     table       :use
+! subroutines
+!     cdft3d: Complex Discrete Fourier Transform
+!     rdft3d: Real Discrete Fourier Transform
+!     ddct3d: Discrete Cosine Transform
+!     ddst3d: Discrete Sine Transform
+! necessary package
+!     fftsg.f  : 1D-FFT package
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+!     [definition]
+!         <case1>
+!             X(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               x(j1,j2,j3) * 
+!                               exp(2*pi*i*j1*k1/n1) * 
+!                               exp(2*pi*i*j2*k2/n2) * 
+!                               exp(2*pi*i*j3*k3/n3), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         <case2>
+!             X(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               x(j1,j2,j3) * 
+!                               exp(-2*pi*i*j1*k1/n1) * 
+!                               exp(-2*pi*i*j2*k2/n2) * 
+!                               exp(-2*pi*i*j3*k3/n3), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call cdft3d(n1max, n2max, 2*n1, n2, n3, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call cdft3d(n1max, n2max, 2*n1, n2, n3, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row1 size of the 3D array (integer)
+!         n2max  :row2 size of the 3D array (integer)
+!         2*n1   :data length (integer)
+!                 n1 >= 1, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 1, n2 = power of 2
+!         n3     :data length (integer)
+!                 n3 >= 1, n3 = power of 2
+!         a(0:2*n1-1,0:n2-1,0:n3-1)
+!                :input/output data (real*8)
+!                 input data
+!                     a(2*j1,j2,j3) = Re(x(j1,j2,j3)), 
+!                     a(2*j1+1,j2,j3) = Im(x(j1,j2,j3)), 
+!                     0<=j1<n1, 0<=j2<n2, 0<=j3<n3
+!                 output data
+!                     a(2*k1,k2,k3) = Re(X(k1,k2,k3)), 
+!                     a(2*k1+1,k2,k3) = Im(X(k1,k2,k3)), 
+!                     0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         t(0:*) :work area (real*8)
+!                 length of t >= max(8*n2, 8*n3)
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1, n2, n3))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/2, n2/2, n3/2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call cdft3d(n1max, n2max, 2*n1, n2, n3, -1, a, t, ip, w)
+!         is 
+!             call cdft3d(n1max, n2max, 2*n1, n2, n3, 1, a, t, ip, w)
+!             do j3 = 0, n3 - 1
+!                 do j2 = 0, n2 - 1
+!                     do j1 = 0, 2 * n1 - 1
+!                         a(j1,j2,j3) = a(j1,j2,j3) * (1.0d0/n1/n2/n3)
+!                     end do
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+!     [definition]
+!         <case1> RDFT
+!             R(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               a(j1,j2,j3) * 
+!                               cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+!                                   2*pi*j3*k3/n3), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!             I(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               a(j1,j2,j3) * 
+!                               sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+!                                   2*pi*j3*k3/n3), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         <case2> IRDFT (excluding scale)
+!             a(k1,k2,k3) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               (R(j1,j2,j3) * 
+!                               cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+!                                   2*pi*j3*k3/n3) + 
+!                               I(j1,j2,j3) * 
+!                               sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 + 
+!                                   2*pi*j3*k3/n3)), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         (notes: R(mod(n1-k1,n1),mod(n2-k2,n2),mod(n3-k3,n3)) = R(k1,k2,k3), 
+!                 I(mod(n1-k1,n1),mod(n2-k2,n2),mod(n3-k3,n3)) = -I(k1,k2,k3), 
+!                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3)
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row1 size of the 3D array (integer)
+!         n2max  :row2 size of the 3D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         n3     :data length (integer)
+!                 n3 >= 2, n3 = power of 2
+!         a(0:n1-1,0:n2-1,0:n3-1)
+!                :input/output data (real*8)
+!                 <case1>
+!                     output data
+!                         a(2*k1,k2,k3) = R(k1,k2,k3)
+!                                 = R(n1-k1,mod(n2-k2,n2),mod(n3-k3,n3)), 
+!                         a(2*k1+1,k2,k3) = I(k1,k2,k3)
+!                                 = -I(n1-k1,mod(n2-k2,n2),mod(n3-k3,n3)), 
+!                             0<k1<n1/2, 0<=k2<n2, 0<=k3<n3, 
+!                         a(0,k2,k3) = R(0,k2,k3)
+!                                    = R(0,n2-k2,mod(n3-k3,n3)), 
+!                         a(1,k2,k3) = I(0,k2,k3)
+!                                    = -I(0,n2-k2,mod(n3-k3,n3)), 
+!                         a(1,n2-k2,k3) = R(n1/2,k2,mod(n3-k3,n3))
+!                                       = R(n1/2,n2-k2,k3), 
+!                         a(0,n2-k2,k3) = -I(n1/2,k2,mod(n3-k3,n3))
+!                                       = I(n1/2,n2-k2,k3), 
+!                             0<k2<n2/2, 0<=k3<n3, 
+!                         a(0,0,k3) = R(0,0,k3)
+!                                   = R(0,0,n3-k3), 
+!                         a(1,0,k3) = I(0,0,k3)
+!                                   = -I(0,0,n3-k3), 
+!                         a(0,n2/2,k3) = R(0,n2/2,k3)
+!                                      = R(0,n2/2,n3-k3), 
+!                         a(1,n2/2,k3) = I(0,n2/2,k3)
+!                                      = -I(0,n2/2,n3-k3), 
+!                         a(1,0,n3-k3) = R(n1/2,0,k3)
+!                                      = R(n1/2,0,n3-k3), 
+!                         a(0,0,n3-k3) = -I(n1/2,0,k3)
+!                                      = I(n1/2,0,n3-k3), 
+!                         a(1,n2/2,n3-k3) = R(n1/2,n2/2,k3)
+!                                         = R(n1/2,n2/2,n3-k3), 
+!                         a(0,n2/2,n3-k3) = -I(n1/2,n2/2,k3)
+!                                         = I(n1/2,n2/2,n3-k3), 
+!                             0<k3<n3/2, 
+!                         a(0,0,0) = R(0,0,0), 
+!                         a(1,0,0) = R(n1/2,0,0), 
+!                         a(0,0,n3/2) = R(0,0,n3/2), 
+!                         a(1,0,n3/2) = R(n1/2,0,n3/2), 
+!                         a(0,n2/2,0) = R(0,n2/2,0), 
+!                         a(1,n2/2,0) = R(n1/2,n2/2,0), 
+!                         a(0,n2/2,n3/2) = R(0,n2/2,n3/2), 
+!                         a(1,n2/2,n3/2) = R(n1/2,n2/2,n3/2)
+!                 <case2>
+!                     input data
+!                         a(2*j1,j2,j3) = R(j1,j2,j3)
+!                                 = R(n1-j1,mod(n2-j2,n2),mod(n3-j3,n3)), 
+!                         a(2*j1+1,j2,j3) = I(j1,j2,j3)
+!                                 = -I(n1-j1,mod(n2-j2,n2),mod(n3-j3,n3)), 
+!                             0<j1<n1/2, 0<=j2<n2, 0<=j3<n3, 
+!                         a(0,j2,j3) = R(0,j2,j3)
+!                                    = R(0,n2-j2,mod(n3-j3,n3)), 
+!                         a(1,j2,j3) = I(0,j2,j3)
+!                                    = -I(0,n2-j2,mod(n3-j3,n3)), 
+!                         a(1,n2-j2,j3) = R(n1/2,j2,mod(n3-j3,n3))
+!                                       = R(n1/2,n2-j2,j3), 
+!                         a(0,n2-j2,j3) = -I(n1/2,j2,mod(n3-j3,n3))
+!                                       = I(n1/2,n2-j2,j3), 
+!                             0<j2<n2/2, 0<=j3<n3, 
+!                         a(0,0,j3) = R(0,0,j3)
+!                                   = R(0,0,n3-j3), 
+!                         a(1,0,j3) = I(0,0,j3)
+!                                   = -I(0,0,n3-j3), 
+!                         a(0,n2/2,j3) = R(0,n2/2,j3)
+!                                      = R(0,n2/2,n3-j3), 
+!                         a(1,n2/2,j3) = I(0,n2/2,j3)
+!                                      = -I(0,n2/2,n3-j3), 
+!                         a(1,0,n3-j3) = R(n1/2,0,j3)
+!                                      = R(n1/2,0,n3-j3), 
+!                         a(0,0,n3-j3) = -I(n1/2,0,j3)
+!                                      = I(n1/2,0,n3-j3), 
+!                         a(1,n2/2,n3-j3) = R(n1/2,n2/2,j3)
+!                                         = R(n1/2,n2/2,n3-j3), 
+!                         a(0,n2/2,n3-j3) = -I(n1/2,n2/2,j3)
+!                                         = I(n1/2,n2/2,n3-j3), 
+!                             0<j3<n3/2, 
+!                         a(0,0,0) = R(0,0,0), 
+!                         a(1,0,0) = R(n1/2,0,0), 
+!                         a(0,0,n3/2) = R(0,0,n3/2), 
+!                         a(1,0,n3/2) = R(n1/2,0,n3/2), 
+!                         a(0,n2/2,0) = R(0,n2/2,0), 
+!                         a(1,n2/2,0) = R(n1/2,n2/2,0), 
+!                         a(0,n2/2,n3/2) = R(0,n2/2,n3/2), 
+!                         a(1,n2/2,n3/2) = R(n1/2,n2/2,n3/2)
+!                 ---- output ordering ----
+!                     call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+!                     call rdft3dsort(n1max, n2max, n1, n2, n3, 1, a)
+!                     ! stored data is a(0:n1-1,0:n2-1,0:n3+1):
+!                     ! a(2*k1,k2,k3) = R(k1,k2,k3), 
+!                     ! a(2*k1+1,k2,k3) = I(k1,k2,k3), 
+!                     ! 0<=k1<=n1/2, 0<=k2<n2, 0<=k3<n3.
+!                     ! the stored data is larger than the input data!
+!                 ---- input ordering ----
+!                     call rdft3dsort(n1max, n2max, n1, n2, n3, -1, a)
+!                     call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+!         t(0:*) :work area (real*8)
+!                 length of t >= max(8*n2, 8*n3)
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2, n3))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1/4, n2/2, n3/2) + n1/4
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+!         is 
+!             call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+!             do j3 = 0, n3 - 1
+!                 do j2 = 0, n2 - 1
+!                     do j1 = 0, n1 - 1
+!                         a(j1,j2,j3) = a(j1,j2,j3) * (2.0d0/n1/n2/n3)
+!                     end do
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+!     [definition]
+!         <case1> IDCT (excluding scale)
+!             C(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               a(j1,j2,j3) * 
+!                               cos(pi*j1*(k1+1/2)/n1) * 
+!                               cos(pi*j2*(k2+1/2)/n2) * 
+!                               cos(pi*j3*(k3+1/2)/n3), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         <case2> DCT
+!             C(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               a(j1,j2,j3) * 
+!                               cos(pi*(j1+1/2)*k1/n1) * 
+!                               cos(pi*(j2+1/2)*k2/n2) * 
+!                               cos(pi*(j3+1/2)*k3/n3), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddct3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddct3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row1 size of the 3D array (integer)
+!         n2max  :row2 size of the 3D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         n3     :data length (integer)
+!                 n3 >= 2, n3 = power of 2
+!         a(0:n1-1,0:n2-1,0:n3-1)
+!                :input/output data (real*8)
+!                 output data
+!                     a(k1,k2,k3) = C(k1,k2,k3), 
+!                         0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         t(0:*) :work area (real*8)
+!                 length of t >= max(4*n2, 4*n3)
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2/2, n3/2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddct3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+!         is 
+!             do j3 = 0, n3 - 1
+!                 do j2 = 0, n2 - 1
+!                     a(0, j2, j3) = a(0, j2, j3) * 0.5d0
+!                 end do
+!                 do j1 = 0, n1 - 1
+!                     a(j1, 0, j3) = a(j1, 0, j3) * 0.5d0
+!                 end do
+!             end do
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2, 0) = a(j1, j2, 0) * 0.5d0
+!                 end do
+!             end do
+!             call ddct3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+!             do j3 = 0, n3 - 1
+!                 do j2 = 0, n2 - 1
+!                     do j1 = 0, n1 - 1
+!                         a(j1,j2,j3) = a(j1,j2,j3) * (8.0d0/n1/n2/n3)
+!                     end do
+!                 end do
+!             end do
+!         .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+!     [definition]
+!         <case1> IDST (excluding scale)
+!             S(k1,k2,k3) = sum_j1=1^n1 sum_j2=1^n2 sum_j3=1^n3
+!                               A(j1,j2,j3) * 
+!                               sin(pi*j1*(k1+1/2)/n1) * 
+!                               sin(pi*j2*(k2+1/2)/n2) * 
+!                               sin(pi*j3*(k3+1/2)/n3), 
+!                               0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!         <case2> DST
+!             S(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+!                               a(j1,j2,j3) * 
+!                               sin(pi*(j1+1/2)*k1/n1) * 
+!                               sin(pi*(j2+1/2)*k2/n2) * 
+!                               sin(pi*(j3+1/2)*k3/n3), 
+!                               0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+!     [usage]
+!         <case1>
+!             ip(0) = 0  ! first time only
+!             call ddst3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+!         <case2>
+!             ip(0) = 0  ! first time only
+!             call ddst3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+!     [parameters]
+!         n1max  :row1 size of the 3D array (integer)
+!         n2max  :row2 size of the 3D array (integer)
+!         n1     :data length (integer)
+!                 n1 >= 2, n1 = power of 2
+!         n2     :data length (integer)
+!                 n2 >= 2, n2 = power of 2
+!         n3     :data length (integer)
+!                 n3 >= 2, n3 = power of 2
+!         a(0:n1-1,0:n2-1,0:n3-1)
+!                :input/output data (real*8)
+!                 <case1>
+!                     input data
+!                         a(mod(j1,n1),mod(j2,n2),mod(j3,n3)) = A(j1,j2,j3), 
+!                             0<j1<=n1, 0<j2<=n2, 0<j3<=n3
+!                     output data
+!                         a(k1,k2,k3) = S(k1,k2,k3), 
+!                             0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+!                 <case2>
+!                     output data
+!                         a(mod(k1,n1),mod(k2,n2),mod(k3,n3)) = S(k1,k2,k3), 
+!                             0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+!         t(0:*) :work area (real*8)
+!                 length of t >= max(4*n2, 4*n3)
+!         ip(0:*):work area for bit reversal (integer)
+!                 length of ip >= 2+sqrt(n)
+!                 (n = max(n1/2, n2/2, n3/2))
+!                 ip(0),ip(1) are pointers of the cos/sin table.
+!         w(0:*) :cos/sin table (real*8)
+!                 length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+!                 w(),ip() are initialized if ip(0) = 0.
+!     [remark]
+!         Inverse of 
+!             call ddst3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+!         is 
+!             do j3 = 0, n3 - 1
+!                 do j2 = 0, n2 - 1
+!                     a(0, j2, j3) = a(0, j2, j3) * 0.5d0
+!                 end do
+!                 do j1 = 0, n1 - 1
+!                     a(j1, 0, j3) = a(j1, 0, j3) * 0.5d0
+!                 end do
+!             end do
+!             do j2 = 0, n2 - 1
+!                 do j1 = 0, n1 - 1
+!                     a(j1, j2, 0) = a(j1, j2, 0) * 0.5d0
+!                 end do
+!             end do
+!             call ddst3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+!             do j3 = 0, n3 - 1
+!                 do j2 = 0, n2 - 1
+!                     do j1 = 0, n1 - 1
+!                         a(j1,j2,j3) = a(j1,j2,j3) * (8.0d0/n1/n2/n3)
+!                     end do
+!                 end do
+!             end do
+!         .
+!
+!
+      subroutine cdft3d(n1max, n2max, n1, n2, n3, isgn, a, 
+     &    t, ip, w)
+      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), n
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      n = 2 * max(n2, n3)
+      n = max(n, n1)
+      if (n .gt. 4 * ip(0)) then
+          call makewt(n / 4, ip, w)
+      end if
+      call xdft3da_sub(n1max, n2max, n1, n2, n3, 0, 
+     &    isgn, a, t, ip, w)
+      call cdft3db_sub(n1max, n2max, n1, n2, n3, 
+     &    isgn, a, t, ip, w)
+      end
+!
+      subroutine rdft3d(n1max, n2max, n1, n2, n3, isgn, a, 
+     &    t, ip, w)
+      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
+     &    n, nw, nc
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      n = 2 * max(n2, n3)
+      n = max(n, n1)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n1 .gt. 4 * nc) then
+          nc = n1 / 4
+          call makect(nc, ip, w(nw))
+      end if
+      if (isgn .lt. 0) then
+          call rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
+          call cdft3db_sub(n1max, n2max, n1, n2, n3, 
+     &        isgn, a, t, ip, w)
+          call xdft3da_sub(n1max, n2max, n1, n2, n3, 1, 
+     &        isgn, a, t, ip, w)
+      else
+          call xdft3da_sub(n1max, n2max, n1, n2, n3, 1, 
+     &        isgn, a, t, ip, w)
+          call cdft3db_sub(n1max, n2max, n1, n2, n3, 
+     &        isgn, a, t, ip, w)
+          call rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
+      end if
+      end
+!
+      subroutine rdft3dsort(n1max, n2max, n1, n2, n3, isgn, a)
+      integer n1max, n2max, n1, n2, n3, isgn, n2h, n3h, j, k
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), x, y
+      n2h = n2 / 2
+      n3h = n3 / 2
+      if (isgn .lt. 0) then
+          do k = 0, n3 - 1
+              do j = n2h + 1, n2 - 1
+                  a(0, j, k) = a(n1 + 1, j, k)
+                  a(1, j, k) = a(n1, j, k)
+              end do
+          end do
+          do k = n3h + 1, n3 - 1
+              a(0, 0, k) = a(n1 + 1, 0, k)
+              a(1, 0, k) = a(n1, 0, k)
+              a(0, n2h, k) = a(n1 + 1, n2h, k)
+              a(1, n2h, k) = a(n1, n2h, k)
+          end do
+          a(1, 0, 0) = a(n1, 0, 0)
+          a(1, n2h, 0) = a(n1, n2h, 0)
+          a(1, 0, n3h) = a(n1, 0, n3h)
+          a(1, n2h, n3h) = a(n1, n2h, n3h)
+      else
+          do j = n2h + 1, n2 - 1
+              y = a(0, j, 0)
+              x = a(1, j, 0)
+              a(n1, j, 0) = x
+              a(n1 + 1, j, 0) = y
+              a(n1, n2 - j, 0) = x
+              a(n1 + 1, n2 - j, 0) = -y
+              a(0, j, 0) = a(0, n2 - j, 0)
+              a(1, j, 0) = -a(1, n2 - j, 0)
+          end do
+          do k = 1, n3 - 1
+              do j = n2h + 1, n2 - 1
+                  y = a(0, j, k)
+                  x = a(1, j, k)
+                  a(n1, j, k) = x
+                  a(n1 + 1, j, k) = y
+                  a(n1, n2 - j, n3 - k) = x
+                  a(n1 + 1, n2 - j, n3 - k) = -y
+                  a(0, j, k) = a(0, n2 - j, n3 - k)
+                  a(1, j, k) = -a(1, n2 - j, n3 - k)
+              end do
+          end do
+          do k = n3h + 1, n3 - 1
+              y = a(0, 0, k)
+              x = a(1, 0, k)
+              a(n1, 0, k) = x
+              a(n1 + 1, 0, k) = y
+              a(n1, 0, n3 - k) = x
+              a(n1 + 1, 0, n3 - k) = -y
+              a(0, 0, k) = a(0, 0, n3 - k)
+              a(1, 0, k) = -a(1, 0, n3 - k)
+              y = a(0, n2h, k)
+              x = a(1, n2h, k)
+              a(n1, n2h, k) = x
+              a(n1 + 1, n2h, k) = y
+              a(n1, n2h, n3 - k) = x
+              a(n1 + 1, n2h, n3 - k) = -y
+              a(0, n2h, k) = a(0, n2h, n3 - k)
+              a(1, n2h, k) = -a(1, n2h, n3 - k)
+          end do
+          a(n1, 0, 0) = a(1, 0, 0)
+          a(n1 + 1, 0, 0) = 0
+          a(1, 0, 0) = 0
+          a(n1, n2h, 0) = a(1, n2h, 0)
+          a(n1 + 1, n2h, 0) = 0
+          a(1, n2h, 0) = 0
+          a(n1, 0, n3h) = a(1, 0, n3h)
+          a(n1 + 1, 0, n3h) = 0
+          a(1, 0, n3h) = 0
+          a(n1, n2h, n3h) = a(1, n2h, n3h)
+          a(n1 + 1, n2h, n3h) = 0
+          a(1, n2h, n3h) = 0
+      end if
+      end
+!
+      subroutine ddct3d(n1max, n2max, n1, n2, n3, isgn, a, 
+     &    t, ip, w)
+      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
+     &    n, nw, nc
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      n = max(n2, n3)
+      n = max(n, n1)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      call ddxt3da_sub(n1max, n2max, n1, n2, n3, 0, 
+     &    isgn, a, t, ip, w)
+      call ddxt3db_sub(n1max, n2max, n1, n2, n3, 0, 
+     &    isgn, a, t, ip, w)
+      end
+!
+      subroutine ddst3d(n1max, n2max, n1, n2, n3, isgn, a, 
+     &    t, ip, w)
+      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
+     &    n, nw, nc
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      n = max(n2, n3)
+      n = max(n, n1)
+      nw = ip(0)
+      if (n .gt. 4 * nw) then
+          nw = n / 4
+          call makewt(nw, ip, w)
+      end if
+      nc = ip(1)
+      if (n .gt. nc) then
+          nc = n
+          call makect(nc, ip, w(nw))
+      end if
+      call ddxt3da_sub(n1max, n2max, n1, n2, n3, 1, 
+     &    isgn, a, t, ip, w)
+      call ddxt3db_sub(n1max, n2max, n1, n2, n3, 1, 
+     &    isgn, a, t, ip, w)
+      end
+!
+! -------- child routines --------
+!
+      subroutine xdft3da_sub(n1max, n2max, n1, n2, n3, icr, 
+     &    isgn, a, t, ip, w)
+      integer n1max, n2max, n1, n2, n3, icr, isgn, 
+     &    ip(0 : *), i, j, k
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      do k = 0, n3 - 1
+          if (icr .eq. 0) then
+              do j = 0, n2 - 1
+                  call cdft(n1, isgn, a(0, j, k), ip, w)
+              end do
+          else if (isgn .ge. 0) then
+              do j = 0, n2 - 1
+                  call rdft(n1, isgn, a(0, j, k), ip, w)
+              end do
+          end if
+          if (n1 .gt. 4) then
+              do i = 0, n1 - 8, 8
+                  do j = 0, n2 - 1
+                      t(2 * j) = a(i, j, k)
+                      t(2 * j + 1) = a(i + 1, j, k)
+                      t(2 * n2 + 2 * j) = a(i + 2, j, k)
+                      t(2 * n2 + 2 * j + 1) = a(i + 3, j, k)
+                      t(4 * n2 + 2 * j) = a(i + 4, j, k)
+                      t(4 * n2 + 2 * j + 1) = a(i + 5, j, k)
+                      t(6 * n2 + 2 * j) = a(i + 6, j, k)
+                      t(6 * n2 + 2 * j + 1) = a(i + 7, j, k)
+                  end do
+                  call cdft(2 * n2, isgn, t, ip, w)
+                  call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+                  call cdft(2 * n2, isgn, t(4 * n2), ip, w)
+                  call cdft(2 * n2, isgn, t(6 * n2), ip, w)
+                  do j = 0, n2 - 1
+                      a(i, j, k) = t(2 * j)
+                      a(i + 1, j, k) = t(2 * j + 1)
+                      a(i + 2, j, k) = t(2 * n2 + 2 * j)
+                      a(i + 3, j, k) = t(2 * n2 + 2 * j + 1)
+                      a(i + 4, j, k) = t(4 * n2 + 2 * j)
+                      a(i + 5, j, k) = t(4 * n2 + 2 * j + 1)
+                      a(i + 6, j, k) = t(6 * n2 + 2 * j)
+                      a(i + 7, j, k) = t(6 * n2 + 2 * j + 1)
+                  end do
+              end do
+          else if (n1 .eq. 4) then
+              do j = 0, n2 - 1
+                  t(2 * j) = a(0, j, k)
+                  t(2 * j + 1) = a(1, j, k)
+                  t(2 * n2 + 2 * j) = a(2, j, k)
+                  t(2 * n2 + 2 * j + 1) = a(3, j, k)
+              end do
+              call cdft(2 * n2, isgn, t, ip, w)
+              call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+              do j = 0, n2 - 1
+                  a(0, j, k) = t(2 * j)
+                  a(1, j, k) = t(2 * j + 1)
+                  a(2, j, k) = t(2 * n2 + 2 * j)
+                  a(3, j, k) = t(2 * n2 + 2 * j + 1)
+              end do
+          else if (n1 .eq. 2) then
+              do j = 0, n2 - 1
+                  t(2 * j) = a(0, j, k)
+                  t(2 * j + 1) = a(1, j, k)
+              end do
+              call cdft(2 * n2, isgn, t, ip, w)
+              do j = 0, n2 - 1
+                  a(0, j, k) = t(2 * j)
+                  a(1, j, k) = t(2 * j + 1)
+              end do
+          end if
+          if (icr .ne. 0 .and. isgn .lt. 0) then
+              do j = 0, n2 - 1
+                  call rdft(n1, isgn, a(0, j, k), ip, w)
+              end do
+          end if
+      end do
+      end
+!
+      subroutine cdft3db_sub(n1max, n2max, n1, n2, n3, 
+     &    isgn, a, t, ip, w)
+      integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), 
+     &    i, j, k
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      if (n1 .gt. 4) then
+          do j = 0, n2 - 1
+              do i = 0, n1 - 8, 8
+                  do k = 0, n3 - 1
+                      t(2 * k) = a(i, j, k)
+                      t(2 * k + 1) = a(i + 1, j, k)
+                      t(2 * n3 + 2 * k) = a(i + 2, j, k)
+                      t(2 * n3 + 2 * k + 1) = a(i + 3, j, k)
+                      t(4 * n3 + 2 * k) = a(i + 4, j, k)
+                      t(4 * n3 + 2 * k + 1) = a(i + 5, j, k)
+                      t(6 * n3 + 2 * k) = a(i + 6, j, k)
+                      t(6 * n3 + 2 * k + 1) = a(i + 7, j, k)
+                  end do
+                  call cdft(2 * n3, isgn, t, ip, w)
+                  call cdft(2 * n3, isgn, t(2 * n3), ip, w)
+                  call cdft(2 * n3, isgn, t(4 * n3), ip, w)
+                  call cdft(2 * n3, isgn, t(6 * n3), ip, w)
+                  do k = 0, n3 - 1
+                      a(i, j, k) = t(2 * k)
+                      a(i + 1, j, k) = t(2 * k + 1)
+                      a(i + 2, j, k) = t(2 * n3 + 2 * k)
+                      a(i + 3, j, k) = t(2 * n3 + 2 * k + 1)
+                      a(i + 4, j, k) = t(4 * n3 + 2 * k)
+                      a(i + 5, j, k) = t(4 * n3 + 2 * k + 1)
+                      a(i + 6, j, k) = t(6 * n3 + 2 * k)
+                      a(i + 7, j, k) = t(6 * n3 + 2 * k + 1)
+                  end do
+              end do
+          end do
+      else if (n1 .eq. 4) then
+          do j = 0, n2 - 1
+              do k = 0, n3 - 1
+                  t(2 * k) = a(0, j, k)
+                  t(2 * k + 1) = a(1, j, k)
+                  t(2 * n3 + 2 * k) = a(2, j, k)
+                  t(2 * n3 + 2 * k + 1) = a(3, j, k)
+              end do
+              call cdft(2 * n3, isgn, t, ip, w)
+              call cdft(2 * n3, isgn, t(2 * n3), ip, w)
+              do k = 0, n3 - 1
+                  a(0, j, k) = t(2 * k)
+                  a(1, j, k) = t(2 * k + 1)
+                  a(2, j, k) = t(2 * n3 + 2 * k)
+                  a(3, j, k) = t(2 * n3 + 2 * k + 1)
+              end do
+          end do
+      else if (n1 .eq. 2) then
+          do j = 0, n2 - 1
+              do k = 0, n3 - 1
+                  t(2 * k) = a(0, j, k)
+                  t(2 * k + 1) = a(1, j, k)
+              end do
+              call cdft(2 * n3, isgn, t, ip, w)
+              do k = 0, n3 - 1
+                  a(0, j, k) = t(2 * k)
+                  a(1, j, k) = t(2 * k + 1)
+              end do
+          end do
+      end if
+      end
+!
+      subroutine rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
+      integer n1max, n2max, n1, n2, n3, isgn, 
+     &    n2h, n3h, i, j, k, l
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), xi
+      n2h = n2 / 2
+      n3h = n3 / 2
+      if (isgn .lt. 0) then
+          do k = 1, n3h - 1
+              l = n3 - k
+              xi = a(0, 0, k) - a(0, 0, l)
+              a(0, 0, k) = a(0, 0, k) + a(0, 0, l)
+              a(0, 0, l) = xi
+              xi = a(1, 0, l) - a(1, 0, k)
+              a(1, 0, k) = a(1, 0, k) + a(1, 0, l)
+              a(1, 0, l) = xi
+              xi = a(0, n2h, k) - a(0, n2h, l)
+              a(0, n2h, k) = a(0, n2h, k) + a(0, n2h, l)
+              a(0, n2h, l) = xi
+              xi = a(1, n2h, l) - a(1, n2h, k)
+              a(1, n2h, k) = a(1, n2h, k) + a(1, n2h, l)
+              a(1, n2h, l) = xi
+              do i = 1, n2h - 1
+                  j = n2 - i
+                  xi = a(0, i, k) - a(0, j, l)
+                  a(0, i, k) = a(0, i, k) + a(0, j, l)
+                  a(0, j, l) = xi
+                  xi = a(1, j, l) - a(1, i, k)
+                  a(1, i, k) = a(1, i, k) + a(1, j, l)
+                  a(1, j, l) = xi
+                  xi = a(0, i, l) - a(0, j, k)
+                  a(0, i, l) = a(0, i, l) + a(0, j, k)
+                  a(0, j, k) = xi
+                  xi = a(1, j, k) - a(1, i, l)
+                  a(1, i, l) = a(1, i, l) + a(1, j, k)
+                  a(1, j, k) = xi
+              end do
+          end do
+          do i = 1, n2h - 1
+              j = n2 - i
+              xi = a(0, i, 0) - a(0, j, 0)
+              a(0, i, 0) = a(0, i, 0) + a(0, j, 0)
+              a(0, j, 0) = xi
+              xi = a(1, j, 0) - a(1, i, 0)
+              a(1, i, 0) = a(1, i, 0) + a(1, j, 0)
+              a(1, j, 0) = xi
+              xi = a(0, i, n3h) - a(0, j, n3h)
+              a(0, i, n3h) = a(0, i, n3h) + a(0, j, n3h)
+              a(0, j, n3h) = xi
+              xi = a(1, j, n3h) - a(1, i, n3h)
+              a(1, i, n3h) = a(1, i, n3h) + a(1, j, n3h)
+              a(1, j, n3h) = xi
+          end do
+      else
+          do k = 1, n3h - 1
+              l = n3 - k
+              a(0, 0, l) = 0.5d0 * (a(0, 0, k) - a(0, 0, l))
+              a(0, 0, k) = a(0, 0, k) - a(0, 0, l)
+              a(1, 0, l) = 0.5d0 * (a(1, 0, k) + a(1, 0, l))
+              a(1, 0, k) = a(1, 0, k) - a(1, 0, l)
+              a(0, n2h, l) = 0.5d0 * (a(0, n2h, k) - a(0, n2h, l))
+              a(0, n2h, k) = a(0, n2h, k) - a(0, n2h, l)
+              a(1, n2h, l) = 0.5d0 * (a(1, n2h, k) + a(1, n2h, l))
+              a(1, n2h, k) = a(1, n2h, k) - a(1, n2h, l)
+              do i = 1, n2h - 1
+                  j = n2 - i
+                  a(0, j, l) = 0.5d0 * (a(0, i, k) - a(0, j, l))
+                  a(0, i, k) = a(0, i, k) - a(0, j, l)
+                  a(1, j, l) = 0.5d0 * (a(1, i, k) + a(1, j, l))
+                  a(1, i, k) = a(1, i, k) - a(1, j, l)
+                  a(0, j, k) = 0.5d0 * (a(0, i, l) - a(0, j, k))
+                  a(0, i, l) = a(0, i, l) - a(0, j, k)
+                  a(1, j, k) = 0.5d0 * (a(1, i, l) + a(1, j, k))
+                  a(1, i, l) = a(1, i, l) - a(1, j, k)
+              end do
+          end do
+          do i = 1, n2h - 1
+              j = n2 - i
+              a(0, j, 0) = 0.5d0 * (a(0, i, 0) - a(0, j, 0))
+              a(0, i, 0) = a(0, i, 0) - a(0, j, 0)
+              a(1, j, 0) = 0.5d0 * (a(1, i, 0) + a(1, j, 0))
+              a(1, i, 0) = a(1, i, 0) - a(1, j, 0)
+              a(0, j, n3h) = 0.5d0 * (a(0, i, n3h) - a(0, j, n3h))
+              a(0, i, n3h) = a(0, i, n3h) - a(0, j, n3h)
+              a(1, j, n3h) = 0.5d0 * (a(1, i, n3h) + a(1, j, n3h))
+              a(1, i, n3h) = a(1, i, n3h) - a(1, j, n3h)
+          end do
+      end if
+      end
+!
+      subroutine ddxt3da_sub(n1max, n2max, n1, n2, n3, ics, 
+     &    isgn, a, t, ip, w)
+      integer n1max, n2max, n1, n2, n3, ics, isgn, 
+     &    ip(0 : *), i, j, k
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      do k = 0, n3 - 1
+          if (ics .eq. 0) then
+              do j = 0, n2 - 1
+                  call ddct(n1, isgn, a(0, j, k), ip, w)
+              end do
+          else
+              do j = 0, n2 - 1
+                  call ddst(n1, isgn, a(0, j, k), ip, w)
+              end do
+          end if
+          if (n1 .gt. 2) then
+              do i = 0, n1 - 4, 4
+                  do j = 0, n2 - 1
+                      t(j) = a(i, j, k)
+                      t(n2 + j) = a(i + 1, j, k)
+                      t(2 * n2 + j) = a(i + 2, j, k)
+                      t(3 * n2 + j) = a(i + 3, j, k)
+                  end do
+                  if (ics .eq. 0) then
+                      call ddct(n2, isgn, t, ip, w)
+                      call ddct(n2, isgn, t(n2), ip, w)
+                      call ddct(n2, isgn, t(2 * n2), ip, w)
+                      call ddct(n2, isgn, t(3 * n2), ip, w)
+                  else
+                      call ddst(n2, isgn, t, ip, w)
+                      call ddst(n2, isgn, t(n2), ip, w)
+                      call ddst(n2, isgn, t(2 * n2), ip, w)
+                      call ddst(n2, isgn, t(3 * n2), ip, w)
+                  end if
+                  do j = 0, n2 - 1
+                      a(i, j, k) = t(j)
+                      a(i + 1, j, k) = t(n2 + j)
+                      a(i + 2, j, k) = t(2 * n2 + j)
+                      a(i + 3, j, k) = t(3 * n2 + j)
+                  end do
+              end do
+          else if (n1 .eq. 2) then
+              do j = 0, n2 - 1
+                  t(j) = a(0, j, k)
+                  t(n2 + j) = a(1, j, k)
+              end do
+              if (ics .eq. 0) then
+                  call ddct(n2, isgn, t, ip, w)
+                  call ddct(n2, isgn, t(n2), ip, w)
+              else
+                  call ddst(n2, isgn, t, ip, w)
+                  call ddst(n2, isgn, t(n2), ip, w)
+              end if
+              do j = 0, n2 - 1
+                  a(0, j, k) = t(j)
+                  a(1, j, k) = t(n2 + j)
+              end do
+          end if
+      end do
+      end
+!
+      subroutine ddxt3db_sub(n1max, n2max, n1, n2, n3, ics, 
+     &    isgn, a, t, ip, w)
+      integer n1max, n2max, n1, n2, n3, ics, isgn, 
+     &    ip(0 : *), i, j, k
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), 
+     &    t(0 : *), w(0 : *)
+      if (n1 .gt. 2) then
+          do j = 0, n2 - 1
+              do i = 0, n1 - 4, 4
+                  do k = 0, n3 - 1
+                      t(k) = a(i, j, k)
+                      t(n3 + k) = a(i + 1, j, k)
+                      t(2 * n3 + k) = a(i + 2, j, k)
+                      t(3 * n3 + k) = a(i + 3, j, k)
+                  end do
+                  if (ics .eq. 0) then
+                      call ddct(n3, isgn, t, ip, w)
+                      call ddct(n3, isgn, t(n3), ip, w)
+                      call ddct(n3, isgn, t(2 * n3), ip, w)
+                      call ddct(n3, isgn, t(3 * n3), ip, w)
+                  else
+                      call ddst(n3, isgn, t, ip, w)
+                      call ddst(n3, isgn, t(n3), ip, w)
+                      call ddst(n3, isgn, t(2 * n3), ip, w)
+                      call ddst(n3, isgn, t(3 * n3), ip, w)
+                  end if
+                  do k = 0, n3 - 1
+                      a(i, j, k) = t(k)
+                      a(i + 1, j, k) = t(n3 + k)
+                      a(i + 2, j, k) = t(2 * n3 + k)
+                      a(i + 3, j, k) = t(3 * n3 + k)
+                  end do
+              end do
+          end do
+      else if (n1 .eq. 2) then
+          do j = 0, n2 - 1
+              do k = 0, n3 - 1
+                  t(k) = a(0, j, k)
+                  t(n3 + k) = a(1, j, k)
+              end do
+              if (ics .eq. 0) then
+                  call ddct(n3, isgn, t, ip, w)
+                  call ddct(n3, isgn, t(n3), ip, w)
+              else
+                  call ddst(n3, isgn, t, ip, w)
+                  call ddst(n3, isgn, t(n3), ip, w)
+              end if
+              do k = 0, n3 - 1
+                  a(0, j, k) = t(k)
+                  a(1, j, k) = t(n3 + k)
+              end do
+          end do
+      end if
+      end
+!
diff --git a/fftsg_h.c b/fftsg_h.c
new file mode 100644
index 0000000..e4c47c2
--- /dev/null
+++ b/fftsg_h.c
@@ -0,0 +1,3447 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix
+    data        :inplace
+    table       :not use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *);
+    void rdft(int, int, double *);
+    void ddct(int, int, double *);
+    void ddst(int, int, double *);
+    void dfct(int, double *);
+    void dfst(int, double *);
+macro definitions
+    USE_CDFT_PTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
+    USE_CDFT_WINTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            cdft(2*n, 1, a);
+        <case2>
+            cdft(2*n, -1, a);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]), 
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]), 
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+    [remark]
+        Inverse of 
+            cdft(2*n, -1, a);
+        is 
+            cdft(2*n, 1, a);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            rdft(n, 1, a);
+        <case2>
+            rdft(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+    [remark]
+        Inverse of 
+            rdft(n, 1, a);
+        is 
+            rdft(n, -1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ddct(n, 1, a);
+        <case2>
+            ddct(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+    [remark]
+        Inverse of 
+            ddct(n, -1, a);
+        is 
+            a[0] *= 0.5;
+            ddct(n, 1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ddst(n, 1, a);
+        <case2>
+            ddst(n, -1, a);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+    [remark]
+        Inverse of 
+            ddst(n, -1, a);
+        is 
+            a[0] *= 0.5;
+            ddst(n, 1, a);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        dfct(n, a);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+    [remark]
+        Inverse of 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a);
+        is 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        dfst(n, a);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+    [remark]
+        Inverse of 
+            dfst(n, a);
+        is 
+            dfst(n, a);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+*/
+
+
+void cdft(int n, int isgn, double *a)
+{
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    
+    if (isgn >= 0) {
+        cftfsub(n, a);
+    } else {
+        cftbsub(n, a);
+    }
+}
+
+
+void rdft(int n, int isgn, double *a)
+{
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    double xi;
+    
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftbsub(n, a);
+        }
+    }
+}
+
+
+void ddct(int n, int isgn, double *a)
+{
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    void dctsub(int n, double *a);
+    void dctsub4(int n, double *a);
+    int j;
+    double xr;
+    
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftbsub(n, a);
+        }
+    }
+    if (n > 4) {
+        dctsub(n, a);
+    } else {
+        dctsub4(n, a);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a)
+{
+    void cftfsub(int n, double *a);
+    void cftbsub(int n, double *a);
+    void rftfsub(int n, double *a);
+    void rftbsub(int n, double *a);
+    void dstsub(int n, double *a);
+    void dstsub4(int n, double *a);
+    int j;
+    double xr;
+    
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a);
+            cftbsub(n, a);
+        } else if (n == 4) {
+            cftbsub(n, a);
+        }
+    }
+    if (n > 4) {
+        dstsub(n, a);
+    } else {
+        dstsub4(n, a);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a);
+            rftfsub(n, a);
+        } else if (n == 4) {
+            cftfsub(n, a);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a)
+{
+    void ddct(int n, int isgn, double *a);
+    void bitrv1(int n, double *a);
+    int j, k, m, mh;
+    double xr, xi, yr, yi, an;
+    
+    m = n >> 1;
+    for (j = 0; j < m; j++) {
+        k = n - j;
+        xr = a[j] + a[k];
+        a[j] -= a[k];
+        a[k] = xr;
+    }
+    an = a[n];
+    while (m >= 2) {
+        ddct(m, 1, a);
+        if (m > 2) {
+            bitrv1(m, a);
+        }
+        mh = m >> 1;
+        xi = a[m];
+        a[m] = a[0];
+        a[0] = an - xi;
+        an += xi;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[m + k];
+            xi = a[m + j];
+            yr = a[j];
+            yi = a[k];
+            a[m + j] = yr;
+            a[m + k] = yi;
+            a[j] = xr - xi;
+            a[k] = xr + xi;
+        }
+        xr = a[mh];
+        a[mh] = a[m + mh];
+        a[m + mh] = xr;
+        m = mh;
+    }
+    xi = a[1];
+    a[1] = a[0];
+    a[0] = an + xi;
+    a[n] = an - xi;
+    if (n > 2) {
+        bitrv1(n, a);
+    }
+}
+
+
+void dfst(int n, double *a)
+{
+    void ddst(int n, int isgn, double *a);
+    void bitrv1(int n, double *a);
+    int j, k, m, mh;
+    double xr, xi, yr, yi;
+    
+    m = n >> 1;
+    for (j = 1; j < m; j++) {
+        k = n - j;
+        xr = a[j] - a[k];
+        a[j] += a[k];
+        a[k] = xr;
+    }
+    a[0] = a[m];
+    while (m >= 2) {
+        ddst(m, 1, a);
+        if (m > 2) {
+            bitrv1(m, a);
+        }
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {
+            k = m - j;
+            xr = a[m + k];
+            xi = a[m + j];
+            yr = a[j];
+            yi = a[k];
+            a[m + j] = yr;
+            a[m + k] = yi;
+            a[j] = xr + xi;
+            a[k] = xr - xi;
+        }
+        a[m] = a[0];
+        a[0] = a[m + mh];
+        a[m + mh] = a[mh];
+        m = mh;
+    }
+    a[1] = a[0];
+    a[0] = 0;
+    if (n > 2) {
+        bitrv1(n, a);
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+#include <math.h>
+#ifndef M_PI_2
+#define M_PI_2      1.570796326794896619231321691639751442098584699687
+#endif
+#ifndef WR5000  /* cos(M_PI_2*0.5000) */
+#define WR5000      0.707106781186547524400844362104849039284835937688
+#endif
+#ifndef WR2500  /* cos(M_PI_2*0.2500) */
+#define WR2500      0.923879532511286756128183189396788286822416625863
+#endif
+#ifndef WI2500  /* sin(M_PI_2*0.2500) */
+#define WI2500      0.382683432365089771728459984030398866761344562485
+#endif
+#ifndef WR1250  /* cos(M_PI_2*0.1250) */
+#define WR1250      0.980785280403230449126182236134239036973933730893
+#endif
+#ifndef WI1250  /* sin(M_PI_2*0.1250) */
+#define WI1250      0.195090322016128267848284868477022240927691617751
+#endif
+#ifndef WR3750  /* cos(M_PI_2*0.3750) */
+#define WR3750      0.831469612302545237078788377617905756738560811987
+#endif
+#ifndef WI3750  /* sin(M_PI_2*0.3750) */
+#define WI3750      0.555570233019602224742830813948532874374937190754
+#endif
+
+
+#ifdef USE_CDFT_PTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 8192
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t pthread_t
+#define cdft_thread_create(thp,func,argp) { \
+    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    if (pthread_join(th, NULL) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#endif /* USE_CDFT_PTHREADS */
+
+
+#ifdef USE_CDFT_WINTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 32768
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 524288
+#endif
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t HANDLE
+#define cdft_thread_create(thp,func,argp) { \
+    DWORD thid; \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
+    if (*(thp) == 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th); \
+}
+#endif /* USE_CDFT_WINTHREADS */
+
+
+#ifndef CDFT_LOOP_DIV  /* control of the CDFT's speed & tolerance */
+#define CDFT_LOOP_DIV 32
+#endif
+
+#ifndef RDFT_LOOP_DIV  /* control of the RDFT's speed & tolerance */
+#define RDFT_LOOP_DIV 64
+#endif
+
+#ifndef DCST_LOOP_DIV  /* control of the DCT,DST's speed & tolerance */
+#define DCST_LOOP_DIV 64
+#endif
+
+
+void cftfsub(int n, double *a)
+{
+    void bitrv2(int n, double *a);
+    void bitrv216(double *a);
+    void bitrv208(double *a);
+    void cftmdl1(int n, double *a);
+    void cftrec4(int n, double *a);
+    void cftleaf(int n, int isplt, double *a);
+    void cftfx41(int n, double *a);
+    void cftf161(double *a);
+    void cftf081(double *a);
+    void cftf040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a);
+#endif /* USE_CDFT_THREADS */
+    
+    if (n > 8) {
+        if (n > 32) {
+            cftmdl1(n, a);
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a);
+            } else 
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a);
+            } else if (n > 128) {
+                cftleaf(n, 1, a);
+            } else {
+                cftfx41(n, a);
+            }
+            bitrv2(n, a);
+        } else if (n == 32) {
+            cftf161(a);
+            bitrv216(a);
+        } else {
+            cftf081(a);
+            bitrv208(a);
+        }
+    } else if (n == 8) {
+        cftf040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+
+void cftbsub(int n, double *a)
+{
+    void bitrv2conj(int n, double *a);
+    void bitrv216neg(double *a);
+    void bitrv208neg(double *a);
+    void cftb1st(int n, double *a);
+    void cftrec4(int n, double *a);
+    void cftleaf(int n, int isplt, double *a);
+    void cftfx41(int n, double *a);
+    void cftf161(double *a);
+    void cftf081(double *a);
+    void cftb040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a);
+#endif /* USE_CDFT_THREADS */
+    
+    if (n > 8) {
+        if (n > 32) {
+            cftb1st(n, a);
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a);
+            } else 
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a);
+            } else if (n > 128) {
+                cftleaf(n, 1, a);
+            } else {
+                cftfx41(n, a);
+            }
+            bitrv2conj(n, a);
+        } else if (n == 32) {
+            cftf161(a);
+            bitrv216neg(a);
+        } else {
+            cftf081(a);
+            bitrv208neg(a);
+        }
+    } else if (n == 8) {
+        cftb040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+
+void bitrv2(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k, nh;
+    double xr, xi, yr, yi;
+    
+    m = 4;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    if (l == 8) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 4) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 4) {
+                xr = a[j];
+                xi = a[j + 1];
+                yr = a[k];
+                yi = a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 += m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 += m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = nh >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += m;
+            k1 += 2 * m;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += m;
+            k1 -= m;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - m;
+            k1 += 2 * m - 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            for (i = nh >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 4) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 4) {
+                xr = a[j];
+                xi = a[j + 1];
+                yr = a[k];
+                yi = a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= m;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = nh >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += m;
+            k1 += m;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            for (i = nh >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    }
+}
+
+
+void bitrv2conj(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k, nh;
+    double xr, xi, yr, yi;
+    
+    m = 4;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    if (l == 8) {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 4) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 4) {
+                xr = a[j];
+                xi = -a[j + 1];
+                yr = a[k];
+                yi = -a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 += m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 -= m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 += m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= 2 * m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = nh >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            j1 += m;
+            k1 += 2 * m;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += m;
+            k1 -= m;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - m;
+            k1 += 2 * m - 2;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            for (i = nh >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    } else {
+        j0 = 0;
+        for (k0 = 0; k0 < m; k0 += 4) {
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 4) {
+                xr = a[j];
+                xi = -a[j + 1];
+                yr = a[k];
+                yi = -a[k + 1];
+                a[j] = yr;
+                a[j + 1] = yi;
+                a[k] = xr;
+                a[k + 1] = xi;
+                j1 = j + m;
+                k1 = k + m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m;
+                k1 += m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= m;
+                k1 -= m;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                for (i = nh >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            j1 += m;
+            k1 += m;
+            a[j1 - 1] = -a[j1 - 1];
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+            for (i = nh >> 1; i > (j0 ^= i); i >>= 1);
+        }
+    }
+}
+
+
+void bitrv216(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i, 
+        x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    a[2] = x8r;
+    a[3] = x8i;
+    a[4] = x4r;
+    a[5] = x4i;
+    a[6] = x12r;
+    a[7] = x12i;
+    a[8] = x2r;
+    a[9] = x2i;
+    a[10] = x10r;
+    a[11] = x10i;
+    a[14] = x14r;
+    a[15] = x14i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[20] = x5r;
+    a[21] = x5i;
+    a[22] = x13r;
+    a[23] = x13i;
+    a[24] = x3r;
+    a[25] = x3i;
+    a[26] = x11r;
+    a[27] = x11i;
+    a[28] = x7r;
+    a[29] = x7i;
+}
+
+
+void bitrv216neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i, 
+        x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, 
+        x13r, x13i, x14r, x14i, x15r, x15i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x9r = a[18];
+    x9i = a[19];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    x15r = a[30];
+    x15i = a[31];
+    a[2] = x15r;
+    a[3] = x15i;
+    a[4] = x7r;
+    a[5] = x7i;
+    a[6] = x11r;
+    a[7] = x11i;
+    a[8] = x3r;
+    a[9] = x3i;
+    a[10] = x13r;
+    a[11] = x13i;
+    a[12] = x5r;
+    a[13] = x5i;
+    a[14] = x9r;
+    a[15] = x9i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[18] = x14r;
+    a[19] = x14i;
+    a[20] = x6r;
+    a[21] = x6i;
+    a[22] = x10r;
+    a[23] = x10i;
+    a[24] = x2r;
+    a[25] = x2i;
+    a[26] = x12r;
+    a[27] = x12i;
+    a[28] = x4r;
+    a[29] = x4i;
+    a[30] = x8r;
+    a[31] = x8i;
+}
+
+
+void bitrv208(double *a)
+{
+    double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x6r = a[12];
+    x6i = a[13];
+    a[2] = x4r;
+    a[3] = x4i;
+    a[6] = x6r;
+    a[7] = x6i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[12] = x3r;
+    a[13] = x3i;
+}
+
+
+void bitrv208neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x6r, x6i, x7r, x7i;
+    
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    a[2] = x7r;
+    a[3] = x7i;
+    a[4] = x3r;
+    a[5] = x3i;
+    a[6] = x5r;
+    a[7] = x5i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[10] = x6r;
+    a[11] = x6i;
+    a[12] = x2r;
+    a[13] = x2i;
+    a[14] = x4r;
+    a[15] = x4i;
+}
+
+
+void bitrv1(int n, double *a)
+{
+    int j0, k0, j1, k1, l, m, i, j, k, nh;
+    double x;
+    
+    nh = n >> 1;
+    x = a[1];
+    a[1] = a[nh];
+    a[nh] = x;
+    m = 2;
+    for (l = n >> 2; l > 2; l >>= 2) {
+        m <<= 1;
+    }
+    if (l == 2) {
+        j1 = m + 1;
+        k1 = m + nh;
+        x = a[j1];
+        a[j1] = a[k1];
+        a[k1] = x;
+        j0 = 0;
+        for (k0 = 2; k0 < m; k0 += 2) {
+            for (i = nh >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                x = a[j];
+                a[j] = a[k];
+                a[k] = x;
+                j1 = j + m;
+                k1 = k + m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 += nh;
+                k1++;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 -= m;
+                k1 -= m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1++;
+                k1 += nh;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 += m;
+                k1 += m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 -= nh;
+                k1--;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 -= m;
+                k1 -= m;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                for (i = nh >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            j1 = k1 + 1;
+            k1 += nh;
+            x = a[j1];
+            a[j1] = a[k1];
+            a[k1] = x;
+            j1 += m;
+            k1 += m;
+            x = a[j1];
+            a[j1] = a[k1];
+            a[k1] = x;
+        }
+    } else {
+        j0 = 0;
+        for (k0 = 2; k0 < m; k0 += 2) {
+            for (i = nh >> 1; i > (j0 ^= i); i >>= 1);
+            k = k0;
+            for (j = j0; j < j0 + k0; j += 2) {
+                x = a[j];
+                a[j] = a[k];
+                a[k] = x;
+                j1 = j + nh;
+                k1 = k + 1;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1++;
+                k1 += nh;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                j1 -= nh;
+                k1--;
+                x = a[j1];
+                a[j1] = a[k1];
+                a[k1] = x;
+                for (i = nh >> 1; i > (k ^= i); i >>= 1);
+            }
+            k1 = j0 + k0;
+            j1 = k1 + 1;
+            k1 += nh;
+            x = a[j1];
+            a[j1] = a[k1];
+            a[k1] = x;
+        }
+    }
+}
+
+
+void cftb1st(int n, double *a)
+{
+    int i, i0, j, j0, j1, j2, j3, m, mh;
+    double ew, w1r, w1i, wk1r, wk1i, wk3r, wk3i, 
+        wd1r, wd1i, wd3r, wd3i, ss1, ss3;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = -a[1] - a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = -a[1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    a[j2] = x1r + x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r - x3i;
+    a[j3 + 1] = x1i - x3r;
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    ew = M_PI_2 / m;
+    w1r = cos(2 * ew);
+    w1i = sin(2 * ew);
+    wk1r = w1r;
+    wk1i = w1i;
+    ss1 = 2 * w1i;
+    wk3i = 2 * ss1 * wk1r;
+    wk3r = wk1r - wk3i * wk1i;
+    wk3i = wk1i - wk3i * wk1r;
+    ss3 = 2 * wk3i;
+    i = 0;
+    for (;;) {
+        i0 = i + 4 * CDFT_LOOP_DIV;
+        if (i0 > mh - 4) {
+            i0 = mh - 4;
+        }
+        for (j = i + 2; j < i0; j += 4) {
+            wd1r -= ss1 * wk1i;
+            wd1i += ss1 * wk1r;
+            wd3r -= ss3 * wk3i;
+            wd3i += ss3 * wk3r;
+            j1 = j + m;
+            j2 = j1 + m;
+            j3 = j2 + m;
+            x0r = a[j] + a[j2];
+            x0i = -a[j + 1] - a[j2 + 1];
+            x1r = a[j] - a[j2];
+            x1i = -a[j + 1] + a[j2 + 1];
+            x2r = a[j1] + a[j3];
+            x2i = a[j1 + 1] + a[j3 + 1];
+            x3r = a[j1] - a[j3];
+            x3i = a[j1 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i - x2i;
+            a[j1] = x0r - x2r;
+            a[j1 + 1] = x0i + x2i;
+            x0r = x1r + x3i;
+            x0i = x1i + x3r;
+            a[j2] = wk1r * x0r - wk1i * x0i;
+            a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r + wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+            x0r = a[j + 2] + a[j2 + 2];
+            x0i = -a[j + 3] - a[j2 + 3];
+            x1r = a[j + 2] - a[j2 + 2];
+            x1i = -a[j + 3] + a[j2 + 3];
+            x2r = a[j1 + 2] + a[j3 + 2];
+            x2i = a[j1 + 3] + a[j3 + 3];
+            x3r = a[j1 + 2] - a[j3 + 2];
+            x3i = a[j1 + 3] - a[j3 + 3];
+            a[j + 2] = x0r + x2r;
+            a[j + 3] = x0i - x2i;
+            a[j1 + 2] = x0r - x2r;
+            a[j1 + 3] = x0i + x2i;
+            x0r = x1r + x3i;
+            x0i = x1i + x3r;
+            a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+            a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i - x3r;
+            a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+            a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+            j0 = m - j;
+            j1 = j0 + m;
+            j2 = j1 + m;
+            j3 = j2 + m;
+            x0r = a[j0] + a[j2];
+            x0i = -a[j0 + 1] - a[j2 + 1];
+            x1r = a[j0] - a[j2];
+            x1i = -a[j0 + 1] + a[j2 + 1];
+            x2r = a[j1] + a[j3];
+            x2i = a[j1 + 1] + a[j3 + 1];
+            x3r = a[j1] - a[j3];
+            x3i = a[j1 + 1] - a[j3 + 1];
+            a[j0] = x0r + x2r;
+            a[j0 + 1] = x0i - x2i;
+            a[j1] = x0r - x2r;
+            a[j1 + 1] = x0i + x2i;
+            x0r = x1r + x3i;
+            x0i = x1i + x3r;
+            a[j2] = wk1i * x0r - wk1r * x0i;
+            a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3i * x0r + wk3r * x0i;
+            a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+            x0r = a[j0 - 2] + a[j2 - 2];
+            x0i = -a[j0 - 1] - a[j2 - 1];
+            x1r = a[j0 - 2] - a[j2 - 2];
+            x1i = -a[j0 - 1] + a[j2 - 1];
+            x2r = a[j1 - 2] + a[j3 - 2];
+            x2i = a[j1 - 1] + a[j3 - 1];
+            x3r = a[j1 - 2] - a[j3 - 2];
+            x3i = a[j1 - 1] - a[j3 - 1];
+            a[j0 - 2] = x0r + x2r;
+            a[j0 - 1] = x0i - x2i;
+            a[j1 - 2] = x0r - x2r;
+            a[j1 - 1] = x0i + x2i;
+            x0r = x1r + x3i;
+            x0i = x1i + x3r;
+            a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+            a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i - x3r;
+            a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+            a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+            wk1r -= ss1 * wd1i;
+            wk1i += ss1 * wd1r;
+            wk3r -= ss3 * wd3i;
+            wk3i += ss3 * wd3r;
+        }
+        if (i0 == mh - 4) {
+            break;
+        }
+        wd1r = cos(ew * i0);
+        wd1i = sin(ew * i0);
+        wd3i = 4 * wd1i * wd1r;
+        wd3r = wd1r - wd3i * wd1i;
+        wd3i = wd1i - wd3i * wd1r;
+        wk1r = w1r * wd1r - w1i * wd1i;
+        wk1i = w1r * wd1i + w1i * wd1r;
+        wk3i = 4 * wk1i * wk1r;
+        wk3r = wk1r - wk3i * wk1i;
+        wk3i = wk1i - wk3i * wk1r;
+        i = i0;
+    }
+    wd1r = WR5000;
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2];
+    x0i = -a[j0 - 1] - a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = -a[j0 - 1] + a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i - x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = -a[j0 + 1] - a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = -a[j0 + 1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2] = wd1r * (x0r - x0i);
+    a[j2 + 1] = wd1r * (x0i + x0r);
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wd1r * (x0r + x0i);
+    a[j3 + 1] = -wd1r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = -a[j0 + 3] - a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = -a[j0 + 3] + a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i - x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+#ifdef USE_CDFT_THREADS
+struct cdft_arg_st {
+    int n0;
+    int n;
+    double *a;
+};
+typedef struct cdft_arg_st cdft_arg_t;
+
+
+void cftrec4_th(int n, double *a)
+{
+    void *cftrec1_th(void *p);
+    void *cftrec2_th(void *p);
+    int i, idiv4, m, nthread;
+    cdft_thread_t th[4];
+    cdft_arg_t ag[4];
+    
+    nthread = 2;
+    idiv4 = 0;
+    m = n >> 1;
+    if (n > CDFT_4THREADS_BEGIN_N) {
+        nthread = 4;
+        idiv4 = 1;
+        m >>= 1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].n0 = n;
+        ag[i].n = m;
+        ag[i].a = &a[i * m];
+        if (i != idiv4) {
+            cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
+        } else {
+            cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
+        }
+    }
+    for (i = 0; i < nthread; i++) {
+        cdft_thread_wait(th[i]);
+    }
+}
+
+
+void *cftrec1_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a);
+    void cftleaf(int n, int isplt, double *a);
+    void cftmdl1(int n, double *a);
+    int isplt, j, k, m, n, n0;
+    double *a;
+    
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        cftmdl1(m, &a[n - m]);
+    }
+    cftleaf(m, 1, &a[n - m]);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a);
+        cftleaf(m, isplt, &a[j - m]);
+    }
+    return (void *) 0;
+}
+
+
+void *cftrec2_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a);
+    void cftleaf(int n, int isplt, double *a);
+    void cftmdl2(int n, double *a);
+    int isplt, j, k, m, n, n0;
+    double *a;
+    
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    k = 1;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        k <<= 2;
+        cftmdl2(m, &a[n - m]);
+    }
+    cftleaf(m, 0, &a[n - m]);
+    k >>= 1;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a);
+        cftleaf(m, isplt, &a[j - m]);
+    }
+    return (void *) 0;
+}
+#endif /* USE_CDFT_THREADS */
+
+
+void cftrec4(int n, double *a)
+{
+    int cfttree(int n, int j, int k, double *a);
+    void cftleaf(int n, int isplt, double *a);
+    void cftmdl1(int n, double *a);
+    int isplt, j, k, m;
+    
+    m = n;
+    while (m > 512) {
+        m >>= 2;
+        cftmdl1(m, &a[n - m]);
+    }
+    cftleaf(m, 1, &a[n - m]);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a);
+        cftleaf(m, isplt, &a[j - m]);
+    }
+}
+
+
+int cfttree(int n, int j, int k, double *a)
+{
+    void cftmdl1(int n, double *a);
+    void cftmdl2(int n, double *a);
+    int i, isplt, m;
+    
+    if ((k & 3) != 0) {
+        isplt = k & 1;
+        if (isplt != 0) {
+            cftmdl1(n, &a[j - n]);
+        } else {
+            cftmdl2(n, &a[j - n]);
+        }
+    } else {
+        m = n;
+        for (i = k; (i & 3) == 0; i >>= 2) {
+            m <<= 2;
+        }
+        isplt = i & 1;
+        if (isplt != 0) {
+            while (m > 128) {
+                cftmdl1(m, &a[j - m]);
+                m >>= 2;
+            }
+        } else {
+            while (m > 128) {
+                cftmdl2(m, &a[j - m]);
+                m >>= 2;
+            }
+        }
+    }
+    return isplt;
+}
+
+
+void cftleaf(int n, int isplt, double *a)
+{
+    void cftmdl1(int n, double *a);
+    void cftmdl2(int n, double *a);
+    void cftf161(double *a);
+    void cftf162(double *a);
+    void cftf081(double *a);
+    void cftf082(double *a);
+    
+    if (n == 512) {
+        cftmdl1(128, a);
+        cftf161(a);
+        cftf162(&a[32]);
+        cftf161(&a[64]);
+        cftf161(&a[96]);
+        cftmdl2(128, &a[128]);
+        cftf161(&a[128]);
+        cftf162(&a[160]);
+        cftf161(&a[192]);
+        cftf162(&a[224]);
+        cftmdl1(128, &a[256]);
+        cftf161(&a[256]);
+        cftf162(&a[288]);
+        cftf161(&a[320]);
+        cftf161(&a[352]);
+        if (isplt != 0) {
+            cftmdl1(128, &a[384]);
+            cftf161(&a[480]);
+        } else {
+            cftmdl2(128, &a[384]);
+            cftf162(&a[480]);
+        }
+        cftf161(&a[384]);
+        cftf162(&a[416]);
+        cftf161(&a[448]);
+    } else {
+        cftmdl1(64, a);
+        cftf081(a);
+        cftf082(&a[16]);
+        cftf081(&a[32]);
+        cftf081(&a[48]);
+        cftmdl2(64, &a[64]);
+        cftf081(&a[64]);
+        cftf082(&a[80]);
+        cftf081(&a[96]);
+        cftf082(&a[112]);
+        cftmdl1(64, &a[128]);
+        cftf081(&a[128]);
+        cftf082(&a[144]);
+        cftf081(&a[160]);
+        cftf081(&a[176]);
+        if (isplt != 0) {
+            cftmdl1(64, &a[192]);
+            cftf081(&a[240]);
+        } else {
+            cftmdl2(64, &a[192]);
+            cftf082(&a[240]);
+        }
+        cftf081(&a[192]);
+        cftf082(&a[208]);
+        cftf081(&a[224]);
+    }
+}
+
+
+void cftmdl1(int n, double *a)
+{
+    int i, i0, j, j0, j1, j2, j3, m, mh;
+    double ew, w1r, w1i, wk1r, wk1i, wk3r, wk3i, 
+        wd1r, wd1i, wd3r, wd3i, ss1, ss3;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    ew = M_PI_2 / m;
+    w1r = cos(2 * ew);
+    w1i = sin(2 * ew);
+    wk1r = w1r;
+    wk1i = w1i;
+    ss1 = 2 * w1i;
+    wk3i = 2 * ss1 * wk1r;
+    wk3r = wk1r - wk3i * wk1i;
+    wk3i = wk1i - wk3i * wk1r;
+    ss3 = 2 * wk3i;
+    i = 0;
+    for (;;) {
+        i0 = i + 4 * CDFT_LOOP_DIV;
+        if (i0 > mh - 4) {
+            i0 = mh - 4;
+        }
+        for (j = i + 2; j < i0; j += 4) {
+            wd1r -= ss1 * wk1i;
+            wd1i += ss1 * wk1r;
+            wd3r -= ss3 * wk3i;
+            wd3i += ss3 * wk3r;
+            j1 = j + m;
+            j2 = j1 + m;
+            j3 = j2 + m;
+            x0r = a[j] + a[j2];
+            x0i = a[j + 1] + a[j2 + 1];
+            x1r = a[j] - a[j2];
+            x1i = a[j + 1] - a[j2 + 1];
+            x2r = a[j1] + a[j3];
+            x2i = a[j1 + 1] + a[j3 + 1];
+            x3r = a[j1] - a[j3];
+            x3i = a[j1 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j1] = x0r - x2r;
+            a[j1 + 1] = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j2] = wk1r * x0r - wk1i * x0i;
+            a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r + wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+            x0r = a[j + 2] + a[j2 + 2];
+            x0i = a[j + 3] + a[j2 + 3];
+            x1r = a[j + 2] - a[j2 + 2];
+            x1i = a[j + 3] - a[j2 + 3];
+            x2r = a[j1 + 2] + a[j3 + 2];
+            x2i = a[j1 + 3] + a[j3 + 3];
+            x3r = a[j1 + 2] - a[j3 + 2];
+            x3i = a[j1 + 3] - a[j3 + 3];
+            a[j + 2] = x0r + x2r;
+            a[j + 3] = x0i + x2i;
+            a[j1 + 2] = x0r - x2r;
+            a[j1 + 3] = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+            a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+            a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+            j0 = m - j;
+            j1 = j0 + m;
+            j2 = j1 + m;
+            j3 = j2 + m;
+            x0r = a[j0] + a[j2];
+            x0i = a[j0 + 1] + a[j2 + 1];
+            x1r = a[j0] - a[j2];
+            x1i = a[j0 + 1] - a[j2 + 1];
+            x2r = a[j1] + a[j3];
+            x2i = a[j1 + 1] + a[j3 + 1];
+            x3r = a[j1] - a[j3];
+            x3i = a[j1 + 1] - a[j3 + 1];
+            a[j0] = x0r + x2r;
+            a[j0 + 1] = x0i + x2i;
+            a[j1] = x0r - x2r;
+            a[j1 + 1] = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j2] = wk1i * x0r - wk1r * x0i;
+            a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3i * x0r + wk3r * x0i;
+            a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+            x0r = a[j0 - 2] + a[j2 - 2];
+            x0i = a[j0 - 1] + a[j2 - 1];
+            x1r = a[j0 - 2] - a[j2 - 2];
+            x1i = a[j0 - 1] - a[j2 - 1];
+            x2r = a[j1 - 2] + a[j3 - 2];
+            x2i = a[j1 - 1] + a[j3 - 1];
+            x3r = a[j1 - 2] - a[j3 - 2];
+            x3i = a[j1 - 1] - a[j3 - 1];
+            a[j0 - 2] = x0r + x2r;
+            a[j0 - 1] = x0i + x2i;
+            a[j1 - 2] = x0r - x2r;
+            a[j1 - 1] = x0i - x2i;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+            a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+            a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+            wk1r -= ss1 * wd1i;
+            wk1i += ss1 * wd1r;
+            wk3r -= ss3 * wd3i;
+            wk3i += ss3 * wd3r;
+        }
+        if (i0 == mh - 4) {
+            break;
+        }
+        wd1r = cos(ew * i0);
+        wd1i = sin(ew * i0);
+        wd3i = 4 * wd1i * wd1r;
+        wd3r = wd1r - wd3i * wd1i;
+        wd3i = wd1i - wd3i * wd1r;
+        wk1r = w1r * wd1r - w1i * wd1i;
+        wk1i = w1r * wd1i + w1i * wd1r;
+        wk3i = 4 * wk1i * wk1r;
+        wk3r = wk1r - wk3i * wk1i;
+        wk3i = wk1i - wk3i * wk1r;
+        i = i0;
+    }
+    wd1r = WR5000;
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2];
+    x0i = a[j0 - 1] + a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = a[j0 - 1] - a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i + x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wd1r * (x0r - x0i);
+    a[j2 + 1] = wd1r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wd1r * (x0r + x0i);
+    a[j3 + 1] = -wd1r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = a[j0 + 3] + a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = a[j0 + 3] - a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i + x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+void cftmdl2(int n, double *a)
+{
+    int i, i0, j, j0, j1, j2, j3, m, mh;
+    double ew, w1r, w1i, wn4r, wk1r, wk1i, wk3r, wk3i, 
+        wl1r, wl1i, wl3r, wl3i, wd1r, wd1i, wd3r, wd3i, 
+        we1r, we1i, we3r, we3i, ss1, ss3;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
+    
+    mh = n >> 3;
+    m = 2 * mh;
+    wn4r = WR5000;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] - a[j2 + 1];
+    x0i = a[1] + a[j2];
+    x1r = a[0] + a[j2 + 1];
+    x1i = a[1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wn4r * (x2r - x2i);
+    y0i = wn4r * (x2i + x2r);
+    a[0] = x0r + y0r;
+    a[1] = x0i + y0i;
+    a[j1] = x0r - y0r;
+    a[j1 + 1] = x0i - y0i;
+    y0r = wn4r * (x3r - x3i);
+    y0i = wn4r * (x3i + x3r);
+    a[j2] = x1r - y0i;
+    a[j2 + 1] = x1i + y0r;
+    a[j3] = x1r + y0i;
+    a[j3 + 1] = x1i - y0r;
+    wl1r = 1;
+    wl1i = 0;
+    wl3r = 1;
+    wl3i = 0;
+    we1r = wn4r;
+    we1i = wn4r;
+    we3r = -wn4r;
+    we3i = -wn4r;
+    ew = M_PI_2 / (2 * m);
+    w1r = cos(2 * ew);
+    w1i = sin(2 * ew);
+    wk1r = w1r;
+    wk1i = w1i;
+    wd1r = wn4r * (w1r - w1i);
+    wd1i = wn4r * (w1i + w1r);
+    ss1 = 2 * w1i;
+    wk3i = 2 * ss1 * wk1r;
+    wk3r = wk1r - wk3i * wk1i;
+    wk3i = wk1i - wk3i * wk1r;
+    ss3 = 2 * wk3i;
+    wd3r = -wn4r * (wk3r - wk3i);
+    wd3i = -wn4r * (wk3i + wk3r);
+    i = 0;
+    for (;;) {
+        i0 = i + 4 * CDFT_LOOP_DIV;
+        if (i0 > mh - 4) {
+            i0 = mh - 4;
+        }
+        for (j = i + 2; j < i0; j += 4) {
+            wl1r -= ss1 * wk1i;
+            wl1i += ss1 * wk1r;
+            wl3r -= ss3 * wk3i;
+            wl3i += ss3 * wk3r;
+            we1r -= ss1 * wd1i;
+            we1i += ss1 * wd1r;
+            we3r -= ss3 * wd3i;
+            we3i += ss3 * wd3r;
+            j1 = j + m;
+            j2 = j1 + m;
+            j3 = j2 + m;
+            x0r = a[j] - a[j2 + 1];
+            x0i = a[j + 1] + a[j2];
+            x1r = a[j] + a[j2 + 1];
+            x1i = a[j + 1] - a[j2];
+            x2r = a[j1] - a[j3 + 1];
+            x2i = a[j1 + 1] + a[j3];
+            x3r = a[j1] + a[j3 + 1];
+            x3i = a[j1 + 1] - a[j3];
+            y0r = wk1r * x0r - wk1i * x0i;
+            y0i = wk1r * x0i + wk1i * x0r;
+            y2r = wd1r * x2r - wd1i * x2i;
+            y2i = wd1r * x2i + wd1i * x2r;
+            a[j] = y0r + y2r;
+            a[j + 1] = y0i + y2i;
+            a[j1] = y0r - y2r;
+            a[j1 + 1] = y0i - y2i;
+            y0r = wk3r * x1r + wk3i * x1i;
+            y0i = wk3r * x1i - wk3i * x1r;
+            y2r = wd3r * x3r + wd3i * x3i;
+            y2i = wd3r * x3i - wd3i * x3r;
+            a[j2] = y0r + y2r;
+            a[j2 + 1] = y0i + y2i;
+            a[j3] = y0r - y2r;
+            a[j3 + 1] = y0i - y2i;
+            x0r = a[j + 2] - a[j2 + 3];
+            x0i = a[j + 3] + a[j2 + 2];
+            x1r = a[j + 2] + a[j2 + 3];
+            x1i = a[j + 3] - a[j2 + 2];
+            x2r = a[j1 + 2] - a[j3 + 3];
+            x2i = a[j1 + 3] + a[j3 + 2];
+            x3r = a[j1 + 2] + a[j3 + 3];
+            x3i = a[j1 + 3] - a[j3 + 2];
+            y0r = wl1r * x0r - wl1i * x0i;
+            y0i = wl1r * x0i + wl1i * x0r;
+            y2r = we1r * x2r - we1i * x2i;
+            y2i = we1r * x2i + we1i * x2r;
+            a[j + 2] = y0r + y2r;
+            a[j + 3] = y0i + y2i;
+            a[j1 + 2] = y0r - y2r;
+            a[j1 + 3] = y0i - y2i;
+            y0r = wl3r * x1r + wl3i * x1i;
+            y0i = wl3r * x1i - wl3i * x1r;
+            y2r = we3r * x3r + we3i * x3i;
+            y2i = we3r * x3i - we3i * x3r;
+            a[j2 + 2] = y0r + y2r;
+            a[j2 + 3] = y0i + y2i;
+            a[j3 + 2] = y0r - y2r;
+            a[j3 + 3] = y0i - y2i;
+            j0 = m - j;
+            j1 = j0 + m;
+            j2 = j1 + m;
+            j3 = j2 + m;
+            x0r = a[j0] - a[j2 + 1];
+            x0i = a[j0 + 1] + a[j2];
+            x1r = a[j0] + a[j2 + 1];
+            x1i = a[j0 + 1] - a[j2];
+            x2r = a[j1] - a[j3 + 1];
+            x2i = a[j1 + 1] + a[j3];
+            x3r = a[j1] + a[j3 + 1];
+            x3i = a[j1 + 1] - a[j3];
+            y0r = wd1i * x0r - wd1r * x0i;
+            y0i = wd1i * x0i + wd1r * x0r;
+            y2r = wk1i * x2r - wk1r * x2i;
+            y2i = wk1i * x2i + wk1r * x2r;
+            a[j0] = y0r + y2r;
+            a[j0 + 1] = y0i + y2i;
+            a[j1] = y0r - y2r;
+            a[j1 + 1] = y0i - y2i;
+            y0r = wd3i * x1r + wd3r * x1i;
+            y0i = wd3i * x1i - wd3r * x1r;
+            y2r = wk3i * x3r + wk3r * x3i;
+            y2i = wk3i * x3i - wk3r * x3r;
+            a[j2] = y0r + y2r;
+            a[j2 + 1] = y0i + y2i;
+            a[j3] = y0r - y2r;
+            a[j3 + 1] = y0i - y2i;
+            x0r = a[j0 - 2] - a[j2 - 1];
+            x0i = a[j0 - 1] + a[j2 - 2];
+            x1r = a[j0 - 2] + a[j2 - 1];
+            x1i = a[j0 - 1] - a[j2 - 2];
+            x2r = a[j1 - 2] - a[j3 - 1];
+            x2i = a[j1 - 1] + a[j3 - 2];
+            x3r = a[j1 - 2] + a[j3 - 1];
+            x3i = a[j1 - 1] - a[j3 - 2];
+            y0r = we1i * x0r - we1r * x0i;
+            y0i = we1i * x0i + we1r * x0r;
+            y2r = wl1i * x2r - wl1r * x2i;
+            y2i = wl1i * x2i + wl1r * x2r;
+            a[j0 - 2] = y0r + y2r;
+            a[j0 - 1] = y0i + y2i;
+            a[j1 - 2] = y0r - y2r;
+            a[j1 - 1] = y0i - y2i;
+            y0r = we3i * x1r + we3r * x1i;
+            y0i = we3i * x1i - we3r * x1r;
+            y2r = wl3i * x3r + wl3r * x3i;
+            y2i = wl3i * x3i - wl3r * x3r;
+            a[j2 - 2] = y0r + y2r;
+            a[j2 - 1] = y0i + y2i;
+            a[j3 - 2] = y0r - y2r;
+            a[j3 - 1] = y0i - y2i;
+            wk1r -= ss1 * wl1i;
+            wk1i += ss1 * wl1r;
+            wk3r -= ss3 * wl3i;
+            wk3i += ss3 * wl3r;
+            wd1r -= ss1 * we1i;
+            wd1i += ss1 * we1r;
+            wd3r -= ss3 * we3i;
+            wd3i += ss3 * we3r;
+        }
+        if (i0 == mh - 4) {
+            break;
+        }
+        wl1r = cos(ew * i0);
+        wl1i = sin(ew * i0);
+        wl3i = 4 * wl1i * wl1r;
+        wl3r = wl1r - wl3i * wl1i;
+        wl3i = wl1i - wl3i * wl1r;
+        we1r = wn4r * (wl1r - wl1i);
+        we1i = wn4r * (wl1i + wl1r);
+        we3r = -wn4r * (wl3r - wl3i);
+        we3i = -wn4r * (wl3i + wl3r);
+        wk1r = w1r * wl1r - w1i * wl1i;
+        wk1i = w1r * wl1i + w1i * wl1r;
+        wk3i = 4 * wk1i * wk1r;
+        wk3r = wk1r - wk3i * wk1i;
+        wk3i = wk1i - wk3i * wk1r;
+        wd1r = wn4r * (wk1r - wk1i);
+        wd1i = wn4r * (wk1i + wk1r);
+        wd3r = -wn4r * (wk3r - wk3i);
+        wd3i = -wn4r * (wk3i + wk3r);
+        i = i0;
+    }
+    wl1r = WR2500;
+    wl1i = WI2500;
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] - a[j2 - 1];
+    x0i = a[j0 - 1] + a[j2 - 2];
+    x1r = a[j0 - 2] + a[j2 - 1];
+    x1i = a[j0 - 1] - a[j2 - 2];
+    x2r = a[j1 - 2] - a[j3 - 1];
+    x2i = a[j1 - 1] + a[j3 - 2];
+    x3r = a[j1 - 2] + a[j3 - 1];
+    x3i = a[j1 - 1] - a[j3 - 2];
+    y0r = wk1r * x0r - wk1i * x0i;
+    y0i = wk1r * x0i + wk1i * x0r;
+    y2r = wd1r * x2r - wd1i * x2i;
+    y2i = wd1r * x2i + wd1i * x2r;
+    a[j0 - 2] = y0r + y2r;
+    a[j0 - 1] = y0i + y2i;
+    a[j1 - 2] = y0r - y2r;
+    a[j1 - 1] = y0i - y2i;
+    y0r = wk3r * x1r + wk3i * x1i;
+    y0i = wk3r * x1i - wk3i * x1r;
+    y2r = wd3r * x3r + wd3i * x3i;
+    y2i = wd3r * x3i - wd3i * x3r;
+    a[j2 - 2] = y0r + y2r;
+    a[j2 - 1] = y0i + y2i;
+    a[j3 - 2] = y0r - y2r;
+    a[j3 - 1] = y0i - y2i;
+    x0r = a[j0] - a[j2 + 1];
+    x0i = a[j0 + 1] + a[j2];
+    x1r = a[j0] + a[j2 + 1];
+    x1i = a[j0 + 1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wl1r * x0r - wl1i * x0i;
+    y0i = wl1r * x0i + wl1i * x0r;
+    y2r = wl1i * x2r - wl1r * x2i;
+    y2i = wl1i * x2i + wl1r * x2r;
+    a[j0] = y0r + y2r;
+    a[j0 + 1] = y0i + y2i;
+    a[j1] = y0r - y2r;
+    a[j1 + 1] = y0i - y2i;
+    y0r = wl1i * x1r - wl1r * x1i;
+    y0i = wl1i * x1i + wl1r * x1r;
+    y2r = wl1r * x3r - wl1i * x3i;
+    y2i = wl1r * x3i + wl1i * x3r;
+    a[j2] = y0r - y2r;
+    a[j2 + 1] = y0i - y2i;
+    a[j3] = y0r + y2r;
+    a[j3 + 1] = y0i + y2i;
+    x0r = a[j0 + 2] - a[j2 + 3];
+    x0i = a[j0 + 3] + a[j2 + 2];
+    x1r = a[j0 + 2] + a[j2 + 3];
+    x1i = a[j0 + 3] - a[j2 + 2];
+    x2r = a[j1 + 2] - a[j3 + 3];
+    x2i = a[j1 + 3] + a[j3 + 2];
+    x3r = a[j1 + 2] + a[j3 + 3];
+    x3i = a[j1 + 3] - a[j3 + 2];
+    y0r = wd1i * x0r - wd1r * x0i;
+    y0i = wd1i * x0i + wd1r * x0r;
+    y2r = wk1i * x2r - wk1r * x2i;
+    y2i = wk1i * x2i + wk1r * x2r;
+    a[j0 + 2] = y0r + y2r;
+    a[j0 + 3] = y0i + y2i;
+    a[j1 + 2] = y0r - y2r;
+    a[j1 + 3] = y0i - y2i;
+    y0r = wd3i * x1r + wd3r * x1i;
+    y0i = wd3i * x1i - wd3r * x1r;
+    y2r = wk3i * x3r + wk3r * x3i;
+    y2i = wk3i * x3i - wk3r * x3r;
+    a[j2 + 2] = y0r + y2r;
+    a[j2 + 3] = y0i + y2i;
+    a[j3 + 2] = y0r - y2r;
+    a[j3 + 3] = y0i - y2i;
+}
+
+
+void cftfx41(int n, double *a)
+{
+    void cftf161(double *a);
+    void cftf162(double *a);
+    void cftf081(double *a);
+    void cftf082(double *a);
+    
+    if (n == 128) {
+        cftf161(a);
+        cftf162(&a[32]);
+        cftf161(&a[64]);
+        cftf161(&a[96]);
+    } else {
+        cftf081(a);
+        cftf082(&a[16]);
+        cftf081(&a[32]);
+        cftf081(&a[48]);
+    }
+}
+
+
+void cftf161(double *a)
+{
+    double wn4r, wk1r, wk1i, 
+        x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+    
+    wn4r = WR5000;
+    wk1r = WR2500;
+    wk1i = WI2500;
+    x0r = a[0] + a[16];
+    x0i = a[1] + a[17];
+    x1r = a[0] - a[16];
+    x1i = a[1] - a[17];
+    x2r = a[8] + a[24];
+    x2i = a[9] + a[25];
+    x3r = a[8] - a[24];
+    x3i = a[9] - a[25];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y4r = x0r - x2r;
+    y4i = x0i - x2i;
+    y8r = x1r - x3i;
+    y8i = x1i + x3r;
+    y12r = x1r + x3i;
+    y12i = x1i - x3r;
+    x0r = a[2] + a[18];
+    x0i = a[3] + a[19];
+    x1r = a[2] - a[18];
+    x1i = a[3] - a[19];
+    x2r = a[10] + a[26];
+    x2i = a[11] + a[27];
+    x3r = a[10] - a[26];
+    x3i = a[11] - a[27];
+    y1r = x0r + x2r;
+    y1i = x0i + x2i;
+    y5r = x0r - x2r;
+    y5i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y9r = wk1r * x0r - wk1i * x0i;
+    y9i = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y13r = wk1i * x0r - wk1r * x0i;
+    y13i = wk1i * x0i + wk1r * x0r;
+    x0r = a[4] + a[20];
+    x0i = a[5] + a[21];
+    x1r = a[4] - a[20];
+    x1i = a[5] - a[21];
+    x2r = a[12] + a[28];
+    x2i = a[13] + a[29];
+    x3r = a[12] - a[28];
+    x3i = a[13] - a[29];
+    y2r = x0r + x2r;
+    y2i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y10r = wn4r * (x0r - x0i);
+    y10i = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y14r = wn4r * (x0r + x0i);
+    y14i = wn4r * (x0i - x0r);
+    x0r = a[6] + a[22];
+    x0i = a[7] + a[23];
+    x1r = a[6] - a[22];
+    x1i = a[7] - a[23];
+    x2r = a[14] + a[30];
+    x2i = a[15] + a[31];
+    x3r = a[14] - a[30];
+    x3i = a[15] - a[31];
+    y3r = x0r + x2r;
+    y3i = x0i + x2i;
+    y7r = x0r - x2r;
+    y7i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y11r = wk1i * x0r - wk1r * x0i;
+    y11i = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y15r = wk1r * x0r - wk1i * x0i;
+    y15i = wk1r * x0i + wk1i * x0r;
+    x0r = y12r - y14r;
+    x0i = y12i - y14i;
+    x1r = y12r + y14r;
+    x1i = y12i + y14i;
+    x2r = y13r - y15r;
+    x2i = y13i - y15i;
+    x3r = y13r + y15r;
+    x3i = y13i + y15i;
+    a[24] = x0r + x2r;
+    a[25] = x0i + x2i;
+    a[26] = x0r - x2r;
+    a[27] = x0i - x2i;
+    a[28] = x1r - x3i;
+    a[29] = x1i + x3r;
+    a[30] = x1r + x3i;
+    a[31] = x1i - x3r;
+    x0r = y8r + y10r;
+    x0i = y8i + y10i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    x3r = y9r - y11r;
+    x3i = y9i - y11i;
+    a[16] = x0r + x2r;
+    a[17] = x0i + x2i;
+    a[18] = x0r - x2r;
+    a[19] = x0i - x2i;
+    a[20] = x1r - x3i;
+    a[21] = x1i + x3r;
+    a[22] = x1r + x3i;
+    a[23] = x1i - x3r;
+    x0r = y5r - y7i;
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x3r = wn4r * (x0r - x0i);
+    x3i = wn4r * (x0i + x0r);
+    x0r = y4r - y6i;
+    x0i = y4i + y6r;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[10] = x0r - x2r;
+    a[11] = x0i - x2i;
+    a[12] = x1r - x3i;
+    a[13] = x1i + x3r;
+    a[14] = x1r + x3i;
+    a[15] = x1i - x3r;
+    x0r = y0r + y2r;
+    x0i = y0i + y2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    x3r = y1r - y3r;
+    x3i = y1i - y3i;
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x0r - x2r;
+    a[3] = x0i - x2i;
+    a[4] = x1r - x3i;
+    a[5] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+}
+
+
+void cftf162(double *a)
+{
+    double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
+        x0r, x0i, x1r, x1i, x2r, x2i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+    
+    wn4r = WR5000;
+    wk1r = WR1250;
+    wk1i = WI1250;
+    wk2r = WR2500;
+    wk2i = WI2500;
+    wk3r = WR3750;
+    wk3i = WI3750;
+    x1r = a[0] - a[17];
+    x1i = a[1] + a[16];
+    x0r = a[8] - a[25];
+    x0i = a[9] + a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y0r = x1r + x2r;
+    y0i = x1i + x2i;
+    y4r = x1r - x2r;
+    y4i = x1i - x2i;
+    x1r = a[0] + a[17];
+    x1i = a[1] - a[16];
+    x0r = a[8] + a[25];
+    x0i = a[9] - a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y8r = x1r - x2i;
+    y8i = x1i + x2r;
+    y12r = x1r + x2i;
+    y12i = x1i - x2r;
+    x0r = a[2] - a[19];
+    x0i = a[3] + a[18];
+    x1r = wk1r * x0r - wk1i * x0i;
+    x1i = wk1r * x0i + wk1i * x0r;
+    x0r = a[10] - a[27];
+    x0i = a[11] + a[26];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y1r = x1r + x2r;
+    y1i = x1i + x2i;
+    y5r = x1r - x2r;
+    y5i = x1i - x2i;
+    x0r = a[2] + a[19];
+    x0i = a[3] - a[18];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[10] + a[27];
+    x0i = a[11] - a[26];
+    x2r = wk1r * x0r + wk1i * x0i;
+    x2i = wk1r * x0i - wk1i * x0r;
+    y9r = x1r - x2r;
+    y9i = x1i - x2i;
+    y13r = x1r + x2r;
+    y13i = x1i + x2i;
+    x0r = a[4] - a[21];
+    x0i = a[5] + a[20];
+    x1r = wk2r * x0r - wk2i * x0i;
+    x1i = wk2r * x0i + wk2i * x0r;
+    x0r = a[12] - a[29];
+    x0i = a[13] + a[28];
+    x2r = wk2i * x0r - wk2r * x0i;
+    x2i = wk2i * x0i + wk2r * x0r;
+    y2r = x1r + x2r;
+    y2i = x1i + x2i;
+    y6r = x1r - x2r;
+    y6i = x1i - x2i;
+    x0r = a[4] + a[21];
+    x0i = a[5] - a[20];
+    x1r = wk2i * x0r - wk2r * x0i;
+    x1i = wk2i * x0i + wk2r * x0r;
+    x0r = a[12] + a[29];
+    x0i = a[13] - a[28];
+    x2r = wk2r * x0r - wk2i * x0i;
+    x2i = wk2r * x0i + wk2i * x0r;
+    y10r = x1r - x2r;
+    y10i = x1i - x2i;
+    y14r = x1r + x2r;
+    y14i = x1i + x2i;
+    x0r = a[6] - a[23];
+    x0i = a[7] + a[22];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[14] - a[31];
+    x0i = a[15] + a[30];
+    x2r = wk1i * x0r - wk1r * x0i;
+    x2i = wk1i * x0i + wk1r * x0r;
+    y3r = x1r + x2r;
+    y3i = x1i + x2i;
+    y7r = x1r - x2r;
+    y7i = x1i - x2i;
+    x0r = a[6] + a[23];
+    x0i = a[7] - a[22];
+    x1r = wk1i * x0r + wk1r * x0i;
+    x1i = wk1i * x0i - wk1r * x0r;
+    x0r = a[14] + a[31];
+    x0i = a[15] - a[30];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y11r = x1r + x2r;
+    y11i = x1i + x2i;
+    y15r = x1r - x2r;
+    y15i = x1i - x2i;
+    x1r = y0r + y2r;
+    x1i = y0i + y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    a[0] = x1r + x2r;
+    a[1] = x1i + x2i;
+    a[2] = x1r - x2r;
+    a[3] = x1i - x2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r - y3r;
+    x2i = y1i - y3i;
+    a[4] = x1r - x2i;
+    a[5] = x1i + x2r;
+    a[6] = x1r + x2i;
+    a[7] = x1i - x2r;
+    x1r = y4r - y6i;
+    x1i = y4i + y6r;
+    x0r = y5r - y7i;
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[8] = x1r + x2r;
+    a[9] = x1i + x2i;
+    a[10] = x1r - x2r;
+    a[11] = x1i - x2i;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[12] = x1r - x2i;
+    a[13] = x1i + x2r;
+    a[14] = x1r + x2i;
+    a[15] = x1i - x2r;
+    x1r = y8r + y10r;
+    x1i = y8i + y10i;
+    x2r = y9r - y11r;
+    x2i = y9i - y11i;
+    a[16] = x1r + x2r;
+    a[17] = x1i + x2i;
+    a[18] = x1r - x2r;
+    a[19] = x1i - x2i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    a[20] = x1r - x2i;
+    a[21] = x1i + x2r;
+    a[22] = x1r + x2i;
+    a[23] = x1i - x2r;
+    x1r = y12r - y14i;
+    x1i = y12i + y14r;
+    x0r = y13r + y15i;
+    x0i = y13i - y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[24] = x1r + x2r;
+    a[25] = x1i + x2i;
+    a[26] = x1r - x2r;
+    a[27] = x1i - x2i;
+    x1r = y12r + y14i;
+    x1i = y12i - y14r;
+    x0r = y13r - y15i;
+    x0i = y13i + y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[28] = x1r - x2i;
+    a[29] = x1i + x2r;
+    a[30] = x1r + x2i;
+    a[31] = x1i - x2r;
+}
+
+
+void cftf081(double *a)
+{
+    double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = WR5000;
+    x0r = a[0] + a[8];
+    x0i = a[1] + a[9];
+    x1r = a[0] - a[8];
+    x1i = a[1] - a[9];
+    x2r = a[4] + a[12];
+    x2i = a[5] + a[13];
+    x3r = a[4] - a[12];
+    x3i = a[5] - a[13];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y2r = x0r - x2r;
+    y2i = x0i - x2i;
+    y1r = x1r - x3i;
+    y1i = x1i + x3r;
+    y3r = x1r + x3i;
+    y3i = x1i - x3r;
+    x0r = a[2] + a[10];
+    x0i = a[3] + a[11];
+    x1r = a[2] - a[10];
+    x1i = a[3] - a[11];
+    x2r = a[6] + a[14];
+    x2i = a[7] + a[15];
+    x3r = a[6] - a[14];
+    x3i = a[7] - a[15];
+    y4r = x0r + x2r;
+    y4i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    x2r = x1r + x3i;
+    x2i = x1i - x3r;
+    y5r = wn4r * (x0r - x0i);
+    y5i = wn4r * (x0r + x0i);
+    y7r = wn4r * (x2r - x2i);
+    y7i = wn4r * (x2r + x2i);
+    a[8] = y1r + y5r;
+    a[9] = y1i + y5i;
+    a[10] = y1r - y5r;
+    a[11] = y1i - y5i;
+    a[12] = y3r - y7i;
+    a[13] = y3i + y7r;
+    a[14] = y3r + y7i;
+    a[15] = y3i - y7r;
+    a[0] = y0r + y4r;
+    a[1] = y0i + y4i;
+    a[2] = y0r - y4r;
+    a[3] = y0i - y4i;
+    a[4] = y2r - y6i;
+    a[5] = y2i + y6r;
+    a[6] = y2r + y6i;
+    a[7] = y2i - y6r;
+}
+
+
+void cftf082(double *a)
+{
+    double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = WR5000;
+    wk1r = WR2500;
+    wk1i = WI2500;
+    y0r = a[0] - a[9];
+    y0i = a[1] + a[8];
+    y1r = a[0] + a[9];
+    y1i = a[1] - a[8];
+    x0r = a[4] - a[13];
+    x0i = a[5] + a[12];
+    y2r = wn4r * (x0r - x0i);
+    y2i = wn4r * (x0i + x0r);
+    x0r = a[4] + a[13];
+    x0i = a[5] - a[12];
+    y3r = wn4r * (x0r - x0i);
+    y3i = wn4r * (x0i + x0r);
+    x0r = a[2] - a[11];
+    x0i = a[3] + a[10];
+    y4r = wk1r * x0r - wk1i * x0i;
+    y4i = wk1r * x0i + wk1i * x0r;
+    x0r = a[2] + a[11];
+    x0i = a[3] - a[10];
+    y5r = wk1i * x0r - wk1r * x0i;
+    y5i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] - a[15];
+    x0i = a[7] + a[14];
+    y6r = wk1i * x0r - wk1r * x0i;
+    y6i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] + a[15];
+    x0i = a[7] - a[14];
+    y7r = wk1r * x0r - wk1i * x0i;
+    y7i = wk1r * x0i + wk1i * x0r;
+    x0r = y0r + y2r;
+    x0i = y0i + y2i;
+    x1r = y4r + y6r;
+    x1i = y4i + y6i;
+    a[0] = x0r + x1r;
+    a[1] = x0i + x1i;
+    a[2] = x0r - x1r;
+    a[3] = x0i - x1i;
+    x0r = y0r - y2r;
+    x0i = y0i - y2i;
+    x1r = y4r - y6r;
+    x1i = y4i - y6i;
+    a[4] = x0r - x1i;
+    a[5] = x0i + x1r;
+    a[6] = x0r + x1i;
+    a[7] = x0i - x1r;
+    x0r = y1r - y3i;
+    x0i = y1i + y3r;
+    x1r = y5r - y7r;
+    x1i = y5i - y7i;
+    a[8] = x0r + x1r;
+    a[9] = x0i + x1i;
+    a[10] = x0r - x1r;
+    a[11] = x0i - x1i;
+    x0r = y1r + y3i;
+    x0i = y1i - y3r;
+    x1r = y5r + y7r;
+    x1i = y5i + y7i;
+    a[12] = x0r - x1i;
+    a[13] = x0i + x1r;
+    a[14] = x0r + x1i;
+    a[15] = x0i - x1r;
+}
+
+
+void cftf040(double *a)
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[4];
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r - x3i;
+    a[3] = x1i + x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+}
+
+
+void cftb040(double *a)
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[4];
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r + x3i;
+    a[3] = x1i - x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r - x3i;
+    a[7] = x1i + x3r;
+}
+
+
+void cftx020(double *a)
+{
+    double x0r, x0i;
+    
+    x0r = a[0] - a[2];
+    x0i = a[1] - a[3];
+    a[0] += a[2];
+    a[1] += a[3];
+    a[2] = x0r;
+    a[3] = x0i;
+}
+
+
+void rftfsub(int n, double *a)
+{
+    int i, i0, j, k;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = 2 * M_PI_2 / n;
+    wkr = 0;
+    wki = 0;
+    wdi = cos(ec);
+    wdr = sin(ec);
+    wdi *= wdr;
+    wdr *= wdr;
+    w1r = 1 - 2 * wdr;
+    w1i = 2 * wdi;
+    ss = 2 * w1i;
+    i = n >> 1;
+    for (;;) {
+        i0 = i - 4 * RDFT_LOOP_DIV;
+        if (i0 < 4) {
+            i0 = 4;
+        }
+        for (j = i - 4; j >= i0; j -= 4) {
+            k = n - j;
+            xr = a[j + 2] - a[k - 2];
+            xi = a[j + 3] + a[k - 1];
+            yr = wdr * xr - wdi * xi;
+            yi = wdr * xi + wdi * xr;
+            a[j + 2] -= yr;
+            a[j + 3] -= yi;
+            a[k - 2] += yr;
+            a[k - 1] -= yi;
+            wkr += ss * wdi;
+            wki += ss * (0.5 - wdr);
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr - wki * xi;
+            yi = wkr * xi + wki * xr;
+            a[j] -= yr;
+            a[j + 1] -= yi;
+            a[k] += yr;
+            a[k + 1] -= yi;
+            wdr += ss * wki;
+            wdi += ss * (0.5 - wkr);
+        }
+        if (i0 == 4) {
+            break;
+        }
+        wkr = 0.5 * sin(ec * i0);
+        wki = 0.5 * cos(ec * i0);
+        wdr = 0.5 - (wkr * w1r - wki * w1i);
+        wdi = wkr * w1i + wki * w1r;
+        wkr = 0.5 - wkr;
+        i = i0;
+    }
+    xr = a[2] - a[n - 2];
+    xi = a[3] + a[n - 1];
+    yr = wdr * xr - wdi * xi;
+    yi = wdr * xi + wdi * xr;
+    a[2] -= yr;
+    a[3] -= yi;
+    a[n - 2] += yr;
+    a[n - 1] -= yi;
+}
+
+
+void rftbsub(int n, double *a)
+{
+    int i, i0, j, k;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = 2 * M_PI_2 / n;
+    wkr = 0;
+    wki = 0;
+    wdi = cos(ec);
+    wdr = sin(ec);
+    wdi *= wdr;
+    wdr *= wdr;
+    w1r = 1 - 2 * wdr;
+    w1i = 2 * wdi;
+    ss = 2 * w1i;
+    i = n >> 1;
+    for (;;) {
+        i0 = i - 4 * RDFT_LOOP_DIV;
+        if (i0 < 4) {
+            i0 = 4;
+        }
+        for (j = i - 4; j >= i0; j -= 4) {
+            k = n - j;
+            xr = a[j + 2] - a[k - 2];
+            xi = a[j + 3] + a[k - 1];
+            yr = wdr * xr + wdi * xi;
+            yi = wdr * xi - wdi * xr;
+            a[j + 2] -= yr;
+            a[j + 3] -= yi;
+            a[k - 2] += yr;
+            a[k - 1] -= yi;
+            wkr += ss * wdi;
+            wki += ss * (0.5 - wdr);
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr + wki * xi;
+            yi = wkr * xi - wki * xr;
+            a[j] -= yr;
+            a[j + 1] -= yi;
+            a[k] += yr;
+            a[k + 1] -= yi;
+            wdr += ss * wki;
+            wdi += ss * (0.5 - wkr);
+        }
+        if (i0 == 4) {
+            break;
+        }
+        wkr = 0.5 * sin(ec * i0);
+        wki = 0.5 * cos(ec * i0);
+        wdr = 0.5 - (wkr * w1r - wki * w1i);
+        wdi = wkr * w1i + wki * w1r;
+        wkr = 0.5 - wkr;
+        i = i0;
+    }
+    xr = a[2] - a[n - 2];
+    xi = a[3] + a[n - 1];
+    yr = wdr * xr + wdi * xi;
+    yi = wdr * xi - wdi * xr;
+    a[2] -= yr;
+    a[3] -= yi;
+    a[n - 2] += yr;
+    a[n - 1] -= yi;
+}
+
+
+void dctsub(int n, double *a)
+{
+    int i, i0, j, k, m;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = M_PI_2 / n;
+    wkr = 0.5;
+    wki = 0.5;
+    w1r = cos(ec);
+    w1i = sin(ec);
+    wdr = 0.5 * (w1r - w1i);
+    wdi = 0.5 * (w1r + w1i);
+    ss = 2 * w1i;
+    m = n >> 1;
+    i = 0;
+    for (;;) {
+        i0 = i + 2 * DCST_LOOP_DIV;
+        if (i0 > m - 2) {
+            i0 = m - 2;
+        }
+        for (j = i + 2; j <= i0; j += 2) {
+            k = n - j;
+            xr = wdi * a[j - 1] - wdr * a[k + 1];
+            xi = wdr * a[j - 1] + wdi * a[k + 1];
+            wkr -= ss * wdi;
+            wki += ss * wdr;
+            yr = wki * a[j] - wkr * a[k];
+            yi = wkr * a[j] + wki * a[k];
+            wdr -= ss * wki;
+            wdi += ss * wkr;
+            a[k + 1] = xr;
+            a[k] = yr;
+            a[j - 1] = xi;
+            a[j] = yi;
+        }
+        if (i0 == m - 2) {
+            break;
+        }
+        wdr = cos(ec * i0);
+        wdi = sin(ec * i0);
+        wkr = 0.5 * (wdr - wdi);
+        wki = 0.5 * (wdr + wdi);
+        wdr = wkr * w1r - wki * w1i;
+        wdi = wkr * w1i + wki * w1r;
+        i = i0;
+    }
+    xr = wdi * a[m - 1] - wdr * a[m + 1];
+    a[m - 1] = wdr * a[m - 1] + wdi * a[m + 1];
+    a[m + 1] = xr;
+    a[m] *= WR5000;
+}
+
+
+void dstsub(int n, double *a)
+{
+    int i, i0, j, k, m;
+    double ec, w1r, w1i, wkr, wki, wdr, wdi, ss, xr, xi, yr, yi;
+    
+    ec = M_PI_2 / n;
+    wkr = 0.5;
+    wki = 0.5;
+    w1r = cos(ec);
+    w1i = sin(ec);
+    wdr = 0.5 * (w1r - w1i);
+    wdi = 0.5 * (w1r + w1i);
+    ss = 2 * w1i;
+    m = n >> 1;
+    i = 0;
+    for (;;) {
+        i0 = i + 2 * DCST_LOOP_DIV;
+        if (i0 > m - 2) {
+            i0 = m - 2;
+        }
+        for (j = i + 2; j <= i0; j += 2) {
+            k = n - j;
+            xr = wdi * a[k + 1] - wdr * a[j - 1];
+            xi = wdr * a[k + 1] + wdi * a[j - 1];
+            wkr -= ss * wdi;
+            wki += ss * wdr;
+            yr = wki * a[k] - wkr * a[j];
+            yi = wkr * a[k] + wki * a[j];
+            wdr -= ss * wki;
+            wdi += ss * wkr;
+            a[j - 1] = xr;
+            a[j] = yr;
+            a[k + 1] = xi;
+            a[k] = yi;
+        }
+        if (i0 == m - 2) {
+            break;
+        }
+        wdr = cos(ec * i0);
+        wdi = sin(ec * i0);
+        wkr = 0.5 * (wdr - wdi);
+        wki = 0.5 * (wdr + wdi);
+        wdr = wkr * w1r - wki * w1i;
+        wdi = wkr * w1i + wki * w1r;
+        i = i0;
+    }
+    xr = wdi * a[m + 1] - wdr * a[m - 1];
+    a[m + 1] = wdr * a[m + 1] + wdi * a[m - 1];
+    a[m - 1] = xr;
+    a[m] *= WR5000;
+}
+
+
+void dctsub4(int n, double *a)
+{
+    int m;
+    double wki, wdr, wdi, xr;
+    
+    wki = WR5000;
+    m = n >> 1;
+    if (m == 2) {
+        wdr = wki * WI2500;
+        wdi = wki * WR2500;
+        xr = wdi * a[1] - wdr * a[3];
+        a[1] = wdr * a[1] + wdi * a[3];
+        a[3] = xr;
+    }
+    a[m] *= wki;
+}
+
+
+void dstsub4(int n, double *a)
+{
+    int m;
+    double wki, wdr, wdi, xr;
+    
+    wki = WR5000;
+    m = n >> 1;
+    if (m == 2) {
+        wdr = wki * WI2500;
+        wdi = wki * WR2500;
+        xr = wdi * a[3] - wdr * a[1];
+        a[3] = wdr * a[3] + wdi * a[1];
+        a[1] = xr;
+    }
+    a[m] *= wki;
+}
+
diff --git a/readme.txt b/readme.txt
new file mode 100644
index 0000000..145a081
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,167 @@
+General Purpose FFT (Fast Fourier/Cosine/Sine Transform) Package
+
+Description:
+    A package to calculate Discrete Fourier/Cosine/Sine Transforms of 
+    1-dimensional sequences of length 2^N.
+
+Files:
+    fft4g.c    : FFT Package in C       - Fast Version   I   (radix 4,2)
+    fft4g.f    : FFT Package in Fortran - Fast Version   I   (radix 4,2)
+    fft4g_h.c  : FFT Package in C       - Simple Version I   (radix 4,2)
+    fft8g.c    : FFT Package in C       - Fast Version   II  (radix 8,4,2)
+    fft8g.f    : FFT Package in Fortran - Fast Version   II  (radix 8,4,2)
+    fft8g_h.c  : FFT Package in C       - Simple Version II  (radix 8,4,2)
+    fftsg.c    : FFT Package in C       - Fast Version   III (Split-Radix)
+    fftsg.f    : FFT Package in Fortran - Fast Version   III (Split-Radix)
+    fftsg_h.c  : FFT Package in C       - Simple Version III (Split-Radix)
+    readme.txt : Readme File
+    sample1/   : Test Directory
+        Makefile    : for gcc, cc
+        Makefile.f77: for Fortran
+        testxg.c    : Test Program for "fft*g.c"
+        testxg.f    : Test Program for "fft*g.f"
+        testxg_h.c  : Test Program for "fft*g_h.c"
+    sample2/   : Benchmark Directory
+        Makefile    : for gcc, cc
+        Makefile.pth: POSIX Thread version
+        pi_fft.c    : PI(= 3.1415926535897932384626...) Calculation Program
+                      for a Benchmark Test for "fft*g.c"
+
+Difference of the Files:
+    C and Fortran versions are equal and 
+    the same routines are in each version.
+    "fft4g*.*" are optimized for most machines.
+    "fft8g*.*" are fast on the UltraSPARC.
+    "fftsg*.*" are optimized for the machines that 
+    have the multi-level (L1,L2,etc) cache.
+    The simple versions "fft*g_h.c" use no work area, but 
+    the fast versions "fft*g.*" use work areas.
+    The fast versions "fft*g.*" have the same specification.
+
+Routines in the Package:
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+
+Usage:
+    Please refer to the comments in the "fft**.*" file which 
+    you want to use. Brief explanations are in the block 
+    comments of each package. The examples are also given in 
+    the test programs.
+
+Method:
+    -------- cdft --------
+    fft4g*.*, fft8g*.*:
+        A method of in-place, radix 2^M, Sande-Tukey (decimation in 
+        frequency). Index of the butterfly loop is in bit 
+        reverse order to keep continuous memory access.
+    fftsg*.*:
+        A method of in-place, Split-Radix, recursive fast 
+        algorithm.
+    -------- rdft --------
+    A method with a following butterfly operation appended to "cdft".
+    In forward transform :
+        A[k] = sum_j=0^n-1 a[j]*W(n)^(j*k), 0<=k<=n/2, 
+            W(n) = exp(2*pi*i/n), 
+    this routine makes an array x[] :
+        x[j] = a[2*j] + i*a[2*j+1], 0<=j<n/2
+    and calls "cdft" of length n/2 :
+        X[k] = sum_j=0^n/2-1 x[j] * W(n/2)^(j*k), 0<=k<n.
+    The result A[k] are :
+        A[k]     = X[k]     - (1+i*W(n)^k)/2 * (X[k]-conjg(X[n/2-k])), 
+        A[n/2-k] = X[n/2-k] + 
+                        conjg((1+i*W(n)^k)/2 * (X[k]-conjg(X[n/2-k]))), 
+            0<=k<=n/2
+        (notes: conjg() is a complex conjugate, X[n/2]=X[0]).
+    -------- ddct --------
+    A method with a following butterfly operation appended to "rdft".
+    In backward transform :
+        C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n, 
+    this routine makes an array r[] :
+        r[0] = a[0], 
+        r[j]   = Re((a[j] - i*a[n-j]) * W(4*n)^j*(1+i)/2), 
+        r[n-j] = Im((a[j] - i*a[n-j]) * W(4*n)^j*(1+i)/2), 
+            0<j<=n/2
+    and calls "rdft" of length n :
+        A[k] = sum_j=0^n-1 r[j]*W(n)^(j*k), 0<=k<=n/2, 
+            W(n) = exp(2*pi*i/n).
+    The result C[k] are :
+        C[2*k]   =  Re(A[k] * (1-i)), 
+        C[2*k-1] = -Im(A[k] * (1-i)).
+    -------- ddst --------
+    A method with a following butterfly operation appended to "rdft".
+    In backward transform :
+        S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n, 
+    this routine makes an array r[] :
+        r[0] = a[0], 
+        r[j]   = Im((a[n-j] - i*a[j]) * W(4*n)^j*(1+i)/2), 
+        r[n-j] = Re((a[n-j] - i*a[j]) * W(4*n)^j*(1+i)/2), 
+            0<j<=n/2
+    and calls "rdft" of length n :
+        A[k] = sum_j=0^n-1 r[j]*W(n)^(j*k), 0<=k<=n/2, 
+            W(n) = exp(2*pi*i/n).
+    The result S[k] are :
+        S[2*k]   =  Re(A[k] * (1+i)), 
+        S[2*k-1] = -Im(A[k] * (1+i)).
+    -------- dfct --------
+    A method to split into "dfct" and "ddct" of half length.
+    The transform :
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    is divided into :
+        C[2*k]   = sum'_j=0^n/2  (a[j]+a[n-j])*cos(pi*j*k/(n/2)), 
+        C[2*k+1] = sum_j=0^n/2-1 (a[j]-a[n-j])*cos(pi*j*(k+1/2)/(n/2))
+        (sum' is a summation whose last term multiplies 1/2).
+    This routine uses "ddct" recursively.
+    To keep the in-place operation, the data in fft*g_h.*
+    are sorted in bit reversal order.
+    -------- dfst --------
+    A method to split into "dfst" and "ddst" of half length.
+    The transform :
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    is divided into :
+        S[2*k]   = sum_j=1^n/2-1 (a[j]-a[n-j])*sin(pi*j*k/(n/2)), 
+        S[2*k+1] = sum'_j=1^n/2  (a[j]+a[n-j])*sin(pi*j*(k+1/2)/(n/2))
+        (sum' is a summation whose last term multiplies 1/2).
+    This routine uses "ddst" recursively.
+    To keep the in-place operation, the data in fft*g_h.*
+    are sorted in bit reversal order.
+
+Reference:
+    * Masatake MORI, Makoto NATORI, Tatuo TORII: Suchikeisan, 
+      Iwanamikouzajyouhoukagaku18, Iwanami, 1982 (Japanese)
+    * Henri J. Nussbaumer: Fast Fourier Transform and Convolution 
+      Algorithms, Springer Verlag, 1982
+    * C. S. Burrus, Notes on the FFT (with large FFT paper list)
+      http://www-dsp.rice.edu/research/fft/fftnote.asc
+
+Copyright:
+    Copyright(C) 1996-2001 Takuya OOURA
+    email: ooura@mmm.t.u-tokyo.ac.jp
+    download: http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html
+    You may use, copy, modify this code for any purpose and 
+    without fee. You may distribute this ORIGINAL package.
+
+History:
+    ...
+    Dec. 1995  : Edit the General Purpose FFT
+    Mar. 1996  : Change the specification
+    Jun. 1996  : Change the method of trigonometric function table
+    Sep. 1996  : Modify the documents
+    Feb. 1997  : Change the butterfly loops
+    Dec. 1997  : Modify the documents
+    Dec. 1997  : Add "fft4g.*"
+    Jul. 1998  : Fix some bugs in the documents
+    Jul. 1998  : Add "fft8g.*" and delete "fft4f.*"
+    Jul. 1998  : Add a benchmark program "pi_fft.c"
+    Jul. 1999  : Add a simple version "fft*g_h.c"
+    Jul. 1999  : Add a Split-Radix FFT package "fftsg*.c"
+    Sep. 1999  : Reduce the memory operation (minor optimization)
+    Oct. 1999  : Change the butterfly structure of "fftsg*.c"
+    Oct. 1999  : Save the code size
+    Sep. 2001  : Add "fftsg.f"
+    Sep. 2001  : Add Pthread & Win32thread routines to "fftsg*.c"
+    Dec. 2006  : Fix a minor bug in "fftsg.f"
+
diff --git a/readme2d.txt b/readme2d.txt
new file mode 100644
index 0000000..7cc4dfa
--- /dev/null
+++ b/readme2d.txt
@@ -0,0 +1,77 @@
+General Purpose 2D,3D FFT (Fast Fourier Transform) Package
+
+Files
+    alloc.c    : 2D-array Allocation
+    alloc.h    : 2D-array Allocation
+    fft4f2d.c  : 2D FFT Package in C       - Version I (radix 4, 2)
+    fft4f2d.f  : 2D FFT Package in Fortran - Version I (radix 4, 2)
+    fftsg.c    : 1D FFT Package in C       - Fast Version (Split-Radix)
+    fftsg.f    : 1D FFT Package in Fortran - Fast Version (Split-Radix)
+    fftsg2d.c  : 2D FFT Package in C       - Version II (Split-Radix)
+    fftsg2d.f  : 2D FFT Package in Fortran - Version II (Split-Radix)
+    fftsg3d.c  : 3D FFT Package in C       - Version II (Split-Radix)
+    fftsg3d.f  : 3D FFT Package in Fortran - Version II (Split-Radix)
+    shrtdct.c  : 8x8, 16x16 DCT Package
+    sample2d/
+        Makefile    : for gcc, cc
+        Makefile.f77: for Fortran
+        Makefile.pth: Pthread version
+        fft4f2dt.c  : Test Program for "fft4f2d.c"
+        fft4f2dt.f  : Test Program for "fft4f2d.f"
+        fftsg2dt.c  : Test Program for "fftsg2d.c"
+        fftsg2dt.f  : Test Program for "fftsg2d.f"
+        fftsg3dt.c  : Test Program for "fftsg3d.c"
+        fftsg3dt.f  : Test Program for "fftsg3d.f"
+        shrtdctt.c  : Test Program for "shrtdct.c"
+
+Difference of Files
+    C and Fortran versions are equal and 
+    the same routines are in each version.
+    ---- Difference between "fft4f2d.*" and "fftsg2d.*" ----
+    "fft4f2d.*" are optimized for the old machines that 
+    don't have the large size CPU cache.
+    "fftsg2d.*", "fftsg3d.*" use 1D FFT routines in "fftsg.*".
+    "fftsg2d.*", "fftsg3d.*" are optimized for the machines that 
+    have the multi-level (L1,L2,etc) cache.
+
+Routines in the Package
+    in fft4f2d.*, fftsg2d.*
+        cdft2d: 2-dim Complex Discrete Fourier Transform
+        rdft2d: 2-dim Real Discrete Fourier Transform
+        ddct2d: 2-dim Discrete Cosine Transform
+        ddst2d: 2-dim Discrete Sine Transform
+        rdft2dsort: rdft2d input/output ordering (fftsg2d.*)
+    in fftsg3d.*
+        cdft3d: 3-dim Complex Discrete Fourier Transform
+        rdft3d: 3-dim Real Discrete Fourier Transform
+        ddct3d: 3-dim Discrete Cosine Transform
+        ddst3d: 3-dim Discrete Sine Transform
+        rdft3dsort: rdft3d input/output ordering
+    in fftsg.*
+        cdft: 1-dim Complex Discrete Fourier Transform
+        rdft: 1-dim Real Discrete Fourier Transform
+        ddct: 1-dim Discrete Cosine Transform
+        ddst: 1-dim Discrete Sine Transform
+        dfct: 1-dim Real Symmetric DFT
+        dfst: 1-dim Real Anti-symmetric DFT
+        (these routines are called by fftsg2d.*, fftsg3d.*)
+    in shrtdct.c
+        ddct8x8s  : Normalized 8x8 DCT
+        ddct16x16s: Normalized 16x16 DCT
+        (faster than ddct2d())
+
+Usage
+    Brief explanations are in block comments of each packages.
+    The examples are given in the test programs.
+
+Copyright
+    Copyright(C) 1997,2001 Takuya OOURA (email: ooura@kurims.kyoto-u.ac.jp).
+    You may use, copy, modify this code for any purpose and 
+    without fee. You may distribute this ORIGINAL package.
+
+History
+    ...
+    Nov. 2001  : Add 3D-FFT routines
+    Dec. 2006  : Fix a documentation bug in "fftsg3d.*"
+    Dec. 2006  : Fix a minor bug in "fftsg.f"
+
diff --git a/sample1/.Makefile.kate-swp b/sample1/.Makefile.kate-swp
new file mode 100644
index 0000000..3e61931
--- /dev/null
+++ b/sample1/.Makefile.kate-swp
Binary files differ
diff --git a/sample1/Makefile.f77 b/sample1/Makefile.f77
new file mode 100644
index 0000000..055a77c
--- /dev/null
+++ b/sample1/Makefile.f77
@@ -0,0 +1,51 @@
+# ---- for GNU g77 ----
+
+F77 = g77
+
+FFLAGS = -Wall
+
+OFLAGS = -O2
+
+# ---- for SUN WS f77 ----
+#
+#F77 = f77
+#
+#FFLAGS = 
+#
+#OFLAGS = -xO2
+
+
+
+
+all: test4g_f test8g_f testsg_f
+
+
+test4g_f : testxg_f.o fft4g_f.o
+	$(F77) testxg_f.o fft4g_f.o -o test4g_f
+
+test8g_f : testxg_f.o fft8g_f.o
+	$(F77) testxg_f.o fft8g_f.o -o test8g_f
+
+testsg_f : testxg_f.o fftsg_f.o
+	$(F77) testxg_f.o fftsg_f.o -o testsg_f
+
+
+testxg_f.o : testxg.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c testxg.f -o testxg_f.o
+
+
+fft4g_f.o : ../fft4g.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c ../fft4g.f -o fft4g_f.o
+
+fft8g_f.o : ../fft8g.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c ../fft8g.f -o fft8g_f.o
+
+fftsg_f.o : ../fftsg.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg.f -o fftsg_f.o
+
+
+
+
+clean:
+	rm -f *.o
+
diff --git a/sample1/testxg.c b/sample1/testxg.c
new file mode 100644
index 0000000..f351308
--- /dev/null
+++ b/sample1/testxg.c
@@ -0,0 +1,107 @@
+/* test of fft*g.c */
+
+#include <math.h>
+#include <stdio.h>
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200.0))
+
+#ifndef NMAX
+#define NMAX 8192
+#define NMAXSQRT 64
+#endif
+
+void cdft(int, int, double *, int *, double *);
+void rdft(int, int, double *, int *, double *);
+void ddct(int, int, double *, int *, double *);
+void ddst(int, int, double *, int *, double *);
+void dfct(int, double *, double *, int *, double *);
+void dfst(int, double *, double *, int *, double *);
+void putdata(int nini, int nend, double *a);
+double errorcheck(int nini, int nend, double scale, double *a);
+
+
+int main()
+{
+    int n, ip[NMAXSQRT + 2];
+    double a[NMAX + 1], w[NMAX * 5 / 4], t[NMAX / 2 + 1], err;
+
+    printf("data length n=? (must be 2^m)\n");
+    scanf("%d", &n);
+    ip[0] = 0;
+
+    /* check of CDFT */
+    putdata(0, n - 1, a);
+    cdft(n, 1, a, ip, w);
+    cdft(n, -1, a, ip, w);
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("cdft err= %g \n", err);
+
+    /* check of RDFT */
+    putdata(0, n - 1, a);
+    rdft(n, 1, a, ip, w);
+    rdft(n, -1, a, ip, w);
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("rdft err= %g \n", err);
+
+    /* check of DDCT */
+    putdata(0, n - 1, a);
+    ddct(n, 1, a, ip, w);
+    ddct(n, -1, a, ip, w);
+    a[0] *= 0.5;
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("ddct err= %g \n", err);
+
+    /* check of DDST */
+    putdata(0, n - 1, a);
+    ddst(n, 1, a, ip, w);
+    ddst(n, -1, a, ip, w);
+    a[0] *= 0.5;
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("ddst err= %g \n", err);
+
+    /* check of DFCT */
+    putdata(0, n, a);
+    a[0] *= 0.5;
+    a[n] *= 0.5;
+    dfct(n, a, t, ip, w);
+    a[0] *= 0.5;
+    a[n] *= 0.5;
+    dfct(n, a, t, ip, w);
+    err = errorcheck(0, n, 2.0 / n, a);
+    printf("dfct err= %g \n", err);
+
+    /* check of DFST */
+    putdata(1, n - 1, a);
+    dfst(n, a, t, ip, w);
+    dfst(n, a, t, ip, w);
+    err = errorcheck(1, n - 1, 2.0 / n, a);
+    printf("dfst err= %g \n", err);
+
+    return 0;
+}
+
+
+void putdata(int nini, int nend, double *a)
+{
+    int j, seed = 0;
+
+    for (j = nini; j <= nend; j++) {
+        a[j] = RND(&seed);
+    }
+}
+
+
+double errorcheck(int nini, int nend, double scale, double *a)
+{
+    int j, seed = 0;
+    double err = 0, e;
+
+    for (j = nini; j <= nend; j++) {
+        e = RND(&seed) - a[j] * scale;
+        err = MAX(err, fabs(e));
+    }
+    return err;
+}
+
diff --git a/sample1/testxg.f b/sample1/testxg.f
new file mode 100644
index 0000000..7c6ae75
--- /dev/null
+++ b/sample1/testxg.f
@@ -0,0 +1,96 @@
+! test of fft*g.f
+!
+!
+      program main
+      integer nmax, nmaxsqrt
+      parameter (nmax = 32768)
+      parameter (nmaxsqrt = 128)
+      integer n, ip(0 : nmaxsqrt + 1)
+      real*8 a(0 : nmax), w(0 : nmax * 5 / 4 - 1), t(0 : nmax / 2), 
+     &    err, errorcheck
+!
+      write (*, *) 'data length n=? (must be 2**m)'
+      read (*, *) n
+      ip(0) = 0
+!
+!   check of CDFT
+      call putdata(0, n - 1, a)
+      call cdft(n, 1, a, ip, w)
+      call cdft(n, -1, a, ip, w)
+      err = errorcheck(0, n - 1, 2.0d0 / n, a)
+      write (*, *) 'cdft err= ', err
+!
+!   check of RDFT
+      call putdata(0, n - 1, a)
+      call rdft(n, 1, a, ip, w)
+      call rdft(n, -1, a, ip, w)
+      err = errorcheck(0, n - 1, 2.0d0 / n, a)
+      write (*, *) 'rdft err= ', err
+!
+!   check of DDCT
+      call putdata(0, n - 1, a)
+      call ddct(n, 1, a, ip, w)
+      call ddct(n, -1, a, ip, w)
+      a(0) = a(0) * 0.5d0
+      err = errorcheck(0, n - 1, 2.0d0 / n, a)
+      write (*, *) 'ddct err= ', err
+!
+!   check of DDST
+      call putdata(0, n - 1, a)
+      call ddst(n, 1, a, ip, w)
+      call ddst(n, -1, a, ip, w)
+      a(0) = a(0) * 0.5d0
+      err = errorcheck(0, n - 1, 2.0d0 / n, a)
+      write (*, *) 'ddst err= ', err
+!
+!   check of DFCT
+      call putdata(0, n, a)
+      a(0) = a(0) * 0.5d0
+      a(n) = a(n) * 0.5d0
+      call dfct(n, a, t, ip, w)
+      a(0) = a(0) * 0.5d0
+      a(n) = a(n) * 0.5d0
+      call dfct(n, a, t, ip, w)
+      err = errorcheck(0, n, 2.0d0 / n, a)
+      write (*, *) 'dfct err= ', err
+!
+!   check of DFST
+      call putdata(1, n - 1, a)
+      call dfst(n, a, t, ip, w)
+      call dfst(n, a, t, ip, w)
+      err = errorcheck(1, n - 1, 2.0d0 / n, a)
+      write (*, *) 'dfst err= ', err
+!
+      end
+!
+!
+      subroutine putdata(nini, nend, a)
+      integer nini, nend, j, seed
+      real*8 a(0 : *), drnd
+      seed = 0
+      do j = nini, nend
+          a(j) = drnd(seed)
+      end do
+      end
+!
+!
+      function errorcheck(nini, nend, scale, a)
+      integer nini, nend, j, seed
+      real*8 scale, a(0 : *), drnd, err, e, errorcheck
+      err = 0
+      seed = 0
+      do j = nini, nend
+          e = drnd(seed) - a(j) * scale
+          err = max(err, abs(e))
+      end do
+      errorcheck = err
+      end
+!
+!
+! random number generator, 0 <= drnd < 1
+      real*8 function drnd(seed)
+      integer seed
+      seed = mod(seed * 7141 + 54773, 259200)
+      drnd = seed * (1.0d0 / 259200.0d0)
+      end
+!
diff --git a/sample1/testxg_h.c b/sample1/testxg_h.c
new file mode 100644
index 0000000..3227b5d
--- /dev/null
+++ b/sample1/testxg_h.c
@@ -0,0 +1,105 @@
+/* test of fft*g_h.c */
+
+#include <math.h>
+#include <stdio.h>
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200.0))
+
+#ifndef NMAX
+#define NMAX 8192
+#endif
+
+void cdft(int, int, double *);
+void rdft(int, int, double *);
+void ddct(int, int, double *);
+void ddst(int, int, double *);
+void dfct(int, double *);
+void dfst(int, double *);
+void putdata(int nini, int nend, double *a);
+double errorcheck(int nini, int nend, double scale, double *a);
+
+
+int main()
+{
+    int n;
+    double a[NMAX + 1], err;
+
+    printf("data length n=? (must be 2^m)\n");
+    scanf("%d", &n);
+
+    /* check of CDFT */
+    putdata(0, n - 1, a);
+    cdft(n, 1, a);
+    cdft(n, -1, a);
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("cdft err= %g \n", err);
+
+    /* check of RDFT */
+    putdata(0, n - 1, a);
+    rdft(n, 1, a);
+    rdft(n, -1, a);
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("rdft err= %g \n", err);
+
+    /* check of DDCT */
+    putdata(0, n - 1, a);
+    ddct(n, 1, a);
+    ddct(n, -1, a);
+    a[0] *= 0.5;
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("ddct err= %g \n", err);
+
+    /* check of DDST */
+    putdata(0, n - 1, a);
+    ddst(n, 1, a);
+    ddst(n, -1, a);
+    a[0] *= 0.5;
+    err = errorcheck(0, n - 1, 2.0 / n, a);
+    printf("ddst err= %g \n", err);
+
+    /* check of DFCT */
+    putdata(0, n, a);
+    a[0] *= 0.5;
+    a[n] *= 0.5;
+    dfct(n, a);
+    a[0] *= 0.5;
+    a[n] *= 0.5;
+    dfct(n, a);
+    err = errorcheck(0, n, 2.0 / n, a);
+    printf("dfct err= %g \n", err);
+
+    /* check of DFST */
+    putdata(1, n - 1, a);
+    dfst(n, a);
+    dfst(n, a);
+    err = errorcheck(1, n - 1, 2.0 / n, a);
+    printf("dfst err= %g \n", err);
+
+    return 0;
+}
+
+
+void putdata(int nini, int nend, double *a)
+{
+    int j, seed = 0;
+
+    for (j = nini; j <= nend; j++) {
+        a[j] = RND(&seed);
+    }
+}
+
+
+double errorcheck(int nini, int nend, double scale, double *a)
+{
+    int j, seed = 0;
+    double err = 0, e;
+
+    for (j = nini; j <= nend; j++) {
+        e = RND(&seed) - a[j] * scale;
+        err = MAX(err, fabs(e));
+    }
+    return err;
+}
+
diff --git a/sample2/Makefile.pth b/sample2/Makefile.pth
new file mode 100644
index 0000000..d173491
--- /dev/null
+++ b/sample2/Makefile.pth
@@ -0,0 +1,41 @@
+# ---- for GNU gcc ----
+
+CC = gcc
+
+CFLAGS = -Wall -DUSE_CDFT_PTHREADS
+
+OFLAGS_FFT = -O6 -ffast-math
+OFLAGS_PI = -O6 -ffast-math
+
+# ---- for SUN WS cc ----
+#
+#CC = cc
+#
+#CFLAGS = -DUSE_CDFT_PTHREADS
+#
+#OFLAGS_FFT = -fast -xO5
+#OFLAGS_PI = -fast -xO5
+
+
+
+
+all: pi_fftsgpt
+
+
+pi_fftsgpt : pi_fft.o fftsgpt.o
+	$(CC) pi_fft.o fftsgpt.o -lm -lpthread -o pi_fftsgpt
+
+
+pi_fft.o : pi_fft.c
+	$(CC) $(CFLAGS) $(OFLAGS_PI) -c pi_fft.c -o pi_fft.o
+
+
+fftsgpt.o : ../fftsg.c
+	$(CC) $(CFLAGS) $(OFLAGS_FFT) -c ../fftsg.c -o fftsgpt.o
+
+
+
+
+clean:
+	rm -f *.o
+
diff --git a/sample2/pi_fft.c b/sample2/pi_fft.c
new file mode 100644
index 0000000..c9a76bf
--- /dev/null
+++ b/sample2/pi_fft.c
@@ -0,0 +1,1616 @@
+/*
+---- calculation of PI(= 3.14159...) using FFT ----
+    by T.Ooura, ver. LG1.1.2-MP1.5a Sep. 2001.
+
+This is a test program to estimate the performance of
+the FFT routines: fft*g.c.
+
+Example compilation:
+    GNU      : gcc -O6 -ffast-math pi_fft.c fftsg.c -lm -o pi_fftsg
+    SUN      : cc -fast -xO5 pi_fft.c fft8g.c -lm -o pi_fft8g
+    Microsoft: cl /O2 /G6 pi_fft.c fft4g.c /Fepi_fft4g.exe
+    ...
+    etc.
+*/
+
+/* Please check the following macros before compiling */
+#ifndef DBL_ERROR_MARGIN
+#define DBL_ERROR_MARGIN 0.3  /* must be < 0.5 */
+#endif
+
+
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+
+void mp_load_0(int n, int radix, int out[]);
+void mp_load_1(int n, int radix, int out[]);
+void mp_copy(int n, int radix, int in[], int out[]);
+void mp_round(int n, int radix, int m, int inout[]);
+int mp_cmp(int n, int radix, int in1[], int in2[]);
+void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+void mp_imul(int n, int radix, int in1[], int in2, int out[]);
+int mp_idiv(int n, int radix, int in1[], int in2, int out[]);
+void mp_idiv_2(int n, int radix, int in[], int out[]);
+double mp_mul_radix_test(int n, int radix, int nfft, 
+        double tmpfft[], int ip[], double w[]);
+void mp_mul(int n, int radix, int in1[], int in2[], int out[], 
+        int tmp[], int nfft, double tmp1fft[], double tmp2fft[], 
+        double tmp3fft[], int ip[], double w[]);
+void mp_squ(int n, int radix, int in[], int out[], int tmp[], 
+        int nfft, double tmp1fft[], double tmp2fft[], 
+        int ip[], double w[]);
+void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+        int nfft, double in1fft[], double outfft[], 
+        int ip[], double w[]);
+void mp_squh(int n, int radix, int in[], int out[], 
+        int nfft, double inoutfft[], int ip[], double w[]);
+int mp_inv(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+int mp_sqrt(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+void mp_sprintf(int n, int log10_radix, int in[], char out[]);
+void mp_sscanf(int n, int log10_radix, char in[], int out[]);
+void mp_fprintf(int n, int log10_radix, int in[], FILE *fout);
+
+
+int main()
+{
+    int nfft, log2_nfft, radix, log10_radix, n, npow, nprc;
+    double err, d_time, n_op;
+    int *a, *b, *c, *e, *i1, *i2, *ip;
+    double *d1, *d2, *d3, *w;
+    time_t t_1, t_2;
+    FILE *f_log, *f_out;
+    
+    f_log = fopen("pi.log", "w");
+    printf("PI calculation to estimate the FFT benchmarks\n");
+    fprintf(f_log, "PI calculation to estimate the FFT benchmarks\n");
+    printf("length of FFT =?\n");
+    scanf("%d", &nfft);
+    
+    printf("initializing...\n");
+    for (log2_nfft = 1; (1 << log2_nfft) < nfft; log2_nfft++);
+    nfft = 1 << log2_nfft;
+    n = nfft + 2;
+    ip = (int *) malloc((3 + (int) sqrt(0.5 * nfft)) * sizeof(int));
+    w = (double *) malloc(nfft / 2 * sizeof(double));
+    a = (int *) malloc((n + 2) * sizeof(int));
+    b = (int *) malloc((n + 2) * sizeof(int));
+    c = (int *) malloc((n + 2) * sizeof(int));
+    e = (int *) malloc((n + 2) * sizeof(int));
+    i1 = (int *) malloc((n + 2) * sizeof(int));
+    i2 = (int *) malloc((n + 2) * sizeof(int));
+    d1 = (double *) malloc((nfft + 2) * sizeof(double));
+    d2 = (double *) malloc((nfft + 2) * sizeof(double));
+    d3 = (double *) malloc((nfft + 2) * sizeof(double));
+    if (d3 == NULL) {
+        printf("Allocation Failure!\n");
+        exit(1);
+    }
+    ip[0] = 0;
+    /* ---- radix test ---- */
+    log10_radix = 1;
+    radix = 10;
+    err = mp_mul_radix_test(n, radix, nfft, d1, ip, w);
+    err += DBL_EPSILON * (n * radix * radix / 4);
+    while (100 * err < DBL_ERROR_MARGIN && radix <= INT_MAX / 20) {
+        err *= 100;
+        log10_radix++;
+        radix *= 10;
+    }
+    printf("nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err);
+    fprintf(f_log, "nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err);
+    printf("calculating %d digits of PI...\n", log10_radix * (n - 2));
+    fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2));
+    /* ---- time check ---- */
+    time(&t_1);
+    /*
+     * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ----
+     *   c = sqrt(0.125);
+     *   a = 1 + 3 * c;
+     *   b = sqrt(a);
+     *   e = b - 0.625;
+     *   b = 2 * b;
+     *   c = e - c;
+     *   a = a + e;
+     *   npow = 4;
+     *   do {
+     *       npow = 2 * npow;
+     *       e = (a + b) / 2;
+     *       b = sqrt(a * b);
+     *       e = e - b;
+     *       b = 2 * b;
+     *       c = c - e;
+     *       a = e + b;
+     *   } while (e > SQRT_SQRT_EPSILON);
+     *   e = e * e / 4;
+     *   a = a + b;
+     *   pi = (a * a - e - e / 2) / (a * c - e) / npow;
+     * ---- modification ----
+     *   This is a modified version of Gauss-Legendre formula
+     *   (by T.Ooura). It is faster than original version.
+     * ---- reference ----
+     *   1. E.Salamin, 
+     *      Computation of PI Using Arithmetic-Geometric Mean, 
+     *      Mathematics of Computation, Vol.30 1976.
+     *   2. R.P.Brent, 
+     *      Fast Multiple-Precision Evaluation of Elementary Functions, 
+     *      J. ACM 23 1976.
+     *   3. D.Takahasi, Y.Kanada, 
+     *      Calculation of PI to 51.5 Billion Decimal Digits on 
+     *      Distributed Memoriy Parallel Processors, 
+     *      Transactions of Information Processing Society of Japan, 
+     *      Vol.39 No.7 1998.
+     *   4. T.Ooura, 
+     *      Improvement of the PI Calculation Algorithm and 
+     *      Implementation of Fast Multiple-Precision Computation, 
+     *      Information Processing Society of Japan SIG Notes, 
+     *      98-HPC-74, 1998.
+     */
+    /* ---- c = sqrt(0.125) ---- */
+    mp_sscanf(n, log10_radix, "0.125", a);
+    mp_sqrt(n, radix, a, c, i1, i2, nfft, d1, d2, ip, w);
+    /* ---- a = 1 + 3 * c ---- */
+    mp_imul(n, radix, c, 3, e);
+    mp_sscanf(n, log10_radix, "1", a);
+    mp_add(n, radix, a, e, a);
+    /* ---- b = sqrt(a) ---- */
+    mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w);
+    /* ---- e = b - 0.625 ---- */
+    mp_sscanf(n, log10_radix, "0.625", e);
+    mp_sub(n, radix, b, e, e);
+    /* ---- b = 2 * b ---- */
+    mp_add(n, radix, b, b, b);
+    /* ---- c = e - c ---- */
+    mp_sub(n, radix, e, c, c);
+    /* ---- a = a + e ---- */
+    mp_add(n, radix, a, e, a);
+    printf("AGM iteration\n");
+    fprintf(f_log, "AGM iteration\n");
+    npow = 4;
+    do {
+        npow *= 2;
+        /* ---- e = (a + b) / 2 ---- */
+        mp_add(n, radix, a, b, e);
+        mp_idiv_2(n, radix, e, e);
+        /* ---- b = sqrt(a * b) ---- */
+        mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w);
+        mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w);
+        /* ---- e = e - b ---- */
+        mp_sub(n, radix, e, b, e);
+        /* ---- b = 2 * b ---- */
+        mp_add(n, radix, b, b, b);
+        /* ---- c = c - e ---- */
+        mp_sub(n, radix, c, e, c);
+        /* ---- a = e + b ---- */
+        mp_add(n, radix, e, b, a);
+        /* ---- convergence check ---- */
+        nprc = -e[1];
+        if (e[0] == 0) {
+            nprc = n;
+        }
+        printf("precision= %d\n", 4 * nprc * log10_radix);
+        fprintf(f_log, "precision= %d\n", 4 * nprc * log10_radix);
+    } while (4 * nprc <= n);
+    /* ---- e = e * e / 4 (half precision) ---- */
+    mp_idiv_2(n, radix, e, e);
+    mp_squh(n, radix, e, e, nfft, d1, ip, w);
+    /* ---- a = a + b ---- */
+    mp_add(n, radix, a, b, a);
+    /* ---- a = (a * a - e - e / 2) / (a * c - e) / npow ---- */
+    mp_mul(n, radix, a, c, c, i1, nfft, d1, d2, d3, ip, w);
+    mp_sub(n, radix, c, e, c);
+    mp_inv(n, radix, c, b, i1, i2, nfft, d1, d2, ip, w);
+    mp_squ(n, radix, a, a, i1, nfft, d1, d2, ip, w);
+    mp_sub(n, radix, a, e, a);
+    mp_idiv_2(n, radix, e, e);
+    mp_sub(n, radix, a, e, a);
+    mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w);
+    mp_idiv(n, radix, a, npow, a);
+    /* ---- time check ---- */
+    time(&t_2);
+    /* ---- output ---- */
+    f_out = fopen("pi.dat", "w");
+    printf("writing pi.dat...\n");
+    mp_fprintf(n - 1, log10_radix, a, f_out);
+    fclose(f_out);
+    free(d3);
+    free(d2);
+    free(d1);
+    free(i2);
+    free(i1);
+    free(e);
+    free(c);
+    free(b);
+    free(a);
+    free(w);
+    free(ip);
+    /* ---- benchmark ---- */
+    n_op = 50.0 * nfft * log2_nfft * log2_nfft;
+    printf("floating point operation: %g op.\n", n_op);
+    fprintf(f_log, "floating point operation: %g op.\n", n_op);
+    /* ---- difftime ---- */
+    d_time = difftime(t_2, t_1);
+    printf("execution time: %g sec. (real time)\n", d_time);
+    fprintf(f_log, "execution time: %g sec. (real time)\n", d_time);
+    fclose(f_log);
+    return 0;
+}
+
+
+/* -------- multiple precision routines -------- */
+
+
+#include <math.h>
+#include <float.h>
+#include <stdio.h>
+
+/* ---- floating point format ----
+    data := data[0] * pow(radix, data[1]) * 
+            (data[2] + data[3]/radix + data[4]/radix/radix + ...), 
+    data[0]       : sign (1;data>0, -1;data<0, 0;data==0)
+    data[1]       : exponent (0;data==0)
+    data[2...n+1] : digits
+   ---- function prototypes ----
+    void mp_load_0(int n, int radix, int out[]);
+    void mp_load_1(int n, int radix, int out[]);
+    void mp_copy(int n, int radix, int in[], int out[]);
+    void mp_round(int n, int radix, int m, int inout[]);
+    int mp_cmp(int n, int radix, int in1[], int in2[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_imul(int n, int radix, int in1[], int in2, int out[]);
+    int mp_idiv(int n, int radix, int in1[], int in2, int out[]);
+    void mp_idiv_2(int n, int radix, int in[], int out[]);
+    double mp_mul_radix_test(int n, int radix, int nfft, 
+            double tmpfft[], int ip[], double w[]);
+    void mp_mul(int n, int radix, int in1[], int in2[], int out[], 
+            int tmp[], int nfft, double tmp1fft[], double tmp2fft[], 
+            double tmp3fft[], int ip[], double w[]);
+    void mp_squ(int n, int radix, int in[], int out[], int tmp[], 
+            int nfft, double tmp1fft[], double tmp2fft[], 
+            int ip[], double w[]);
+    void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+            int nfft, double in1fft[], double outfft[], 
+            int ip[], double w[]);
+    void mp_squh(int n, int radix, int in[], int out[], 
+            int nfft, double inoutfft[], int ip[], double w[]);
+    int mp_inv(int n, int radix, int in[], int out[], 
+            int tmp1[], int tmp2[], int nfft, 
+            double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+    int mp_sqrt(int n, int radix, int in[], int out[], 
+            int tmp1[], int tmp2[], int nfft, 
+            double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+    void mp_sprintf(int n, int log10_radix, int in[], char out[]);
+    void mp_sscanf(int n, int log10_radix, char in[], int out[]);
+    void mp_fprintf(int n, int log10_radix, int in[], FILE *fout);
+   ----
+*/
+
+
+/* -------- mp_load routines -------- */
+
+
+void mp_load_0(int n, int radix, int out[])
+{
+    int j;
+    
+    for (j = 0; j <= n + 1; j++) {
+        out[j] = 0;
+    }
+}
+
+
+void mp_load_1(int n, int radix, int out[])
+{
+    int j;
+    
+    out[0] = 1;
+    out[1] = 0;
+    out[2] = 1;
+    for (j = 3; j <= n + 1; j++) {
+        out[j] = 0;
+    }
+}
+
+
+void mp_copy(int n, int radix, int in[], int out[])
+{
+    int j;
+    
+    for (j = 0; j <= n + 1; j++) {
+        out[j] = in[j];
+    }
+}
+
+
+void mp_round(int n, int radix, int m, int inout[])
+{
+    int j, x;
+    
+    if (m < n) {
+        for (j = n + 1; j > m + 2; j--) {
+            inout[j] = 0;
+        }
+        x = 2 * inout[m + 2];
+        inout[m + 2] = 0;
+        if (x >= radix) {
+            for (j = m + 1; j >= 2; j--) {
+                x = inout[j] + 1;
+                if (x < radix) {
+                    inout[j] = x;
+                    break;
+                }
+                inout[j] = 0;
+            }
+            if (x >= radix) {
+                inout[2] = 1;
+                inout[1]++;
+            }
+        }
+    }
+}
+
+
+/* -------- mp_add routines -------- */
+
+
+int mp_cmp(int n, int radix, int in1[], int in2[])
+{
+    int mp_unsgn_cmp(int n, int in1[], int in2[]);
+    
+    if (in1[0] > in2[0]) {
+        return 1;
+    } else if (in1[0] < in2[0]) {
+        return -1;
+    }
+    return in1[0] * mp_unsgn_cmp(n, &in1[1], &in2[1]);
+}
+
+
+void mp_add(int n, int radix, int in1[], int in2[], int out[])
+{
+    int mp_unsgn_cmp(int n, int in1[], int in2[]);
+    int mp_unexp_add(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int mp_unexp_sub(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int outsgn, outexp, expdif;
+    
+    expdif = in1[1] - in2[1];
+    outexp = in1[1];
+    if (expdif < 0) {
+        outexp = in2[1];
+    }
+    outsgn = in1[0] * in2[0];
+    if (outsgn >= 0) {
+        if (outsgn > 0) {
+            outsgn = in1[0];
+        } else {
+            outsgn = in1[0] + in2[0];
+            outexp = in1[1] + in2[1];
+            expdif = 0;
+        }
+        if (expdif >= 0) {
+            outexp += mp_unexp_add(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            outexp += mp_unexp_add(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+    } else {
+        outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]);
+        if (outsgn >= 0) {
+            expdif = mp_unexp_sub(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            expdif = mp_unexp_sub(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+        outexp -= expdif;
+        outsgn *= in1[0];
+        if (expdif == n) {
+            outsgn = 0;
+        }
+    }
+    if (outsgn == 0) {
+        outexp = 0;
+    }
+    out[0] = outsgn;
+    out[1] = outexp;
+}
+
+
+void mp_sub(int n, int radix, int in1[], int in2[], int out[])
+{
+    int mp_unsgn_cmp(int n, int in1[], int in2[]);
+    int mp_unexp_add(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int mp_unexp_sub(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int outsgn, outexp, expdif;
+    
+    expdif = in1[1] - in2[1];
+    outexp = in1[1];
+    if (expdif < 0) {
+        outexp = in2[1];
+    }
+    outsgn = in1[0] * in2[0];
+    if (outsgn <= 0) {
+        if (outsgn < 0) {
+            outsgn = in1[0];
+        } else {
+            outsgn = in1[0] - in2[0];
+            outexp = in1[1] + in2[1];
+            expdif = 0;
+        }
+        if (expdif >= 0) {
+            outexp += mp_unexp_add(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            outexp += mp_unexp_add(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+    } else {
+        outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]);
+        if (outsgn >= 0) {
+            expdif = mp_unexp_sub(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            expdif = mp_unexp_sub(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+        outexp -= expdif;
+        outsgn *= in1[0];
+        if (expdif == n) {
+            outsgn = 0;
+        }
+    }
+    if (outsgn == 0) {
+        outexp = 0;
+    }
+    out[0] = outsgn;
+    out[1] = outexp;
+}
+
+
+/* -------- mp_add child routines -------- */
+
+
+int mp_unsgn_cmp(int n, int in1[], int in2[])
+{
+    int j, cmp;
+    
+    cmp = 0;
+    for (j = 0; j <= n && cmp == 0; j++) {
+        cmp = in1[j] - in2[j];
+    }
+    if (cmp > 0) {
+        cmp = 1;
+    } else if (cmp < 0) {
+        cmp = -1;
+    }
+    return cmp;
+}
+
+
+int mp_unexp_add(int n, int radix, int expdif, 
+        int in1[], int in2[], int out[])
+{
+    int j, x, carry;
+    
+    carry = 0;
+    if (expdif == 0 && in1[0] + in2[0] >= radix) {
+        x = in1[n - 1] + in2[n - 1];
+        carry = x >= radix ? -1 : 0;
+        for (j = n - 1; j > 0; j--) {
+            x = in1[j - 1] + in2[j - 1] - carry;
+            carry = x >= radix ? -1 : 0;
+            out[j] = x - (radix & carry);
+        }
+        out[0] = -carry;
+    } else {
+        if (expdif > n) {
+            expdif = n;
+        }
+        for (j = n - 1; j >= expdif; j--) {
+            x = in1[j] + in2[j - expdif] - carry;
+            carry = x >= radix ? -1 : 0;
+            out[j] = x - (radix & carry);
+        }
+        for (j = expdif - 1; j >= 0; j--) {
+            x = in1[j] - carry;
+            carry = x >= radix ? -1 : 0;
+            out[j] = x - (radix & carry);
+        }
+        if (carry != 0) {
+            for (j = n - 1; j > 0; j--) {
+                out[j] = out[j - 1];
+            }
+            out[0] = -carry;
+        }
+    }
+    return -carry;
+}
+
+
+int mp_unexp_sub(int n, int radix, int expdif, 
+        int in1[], int in2[], int out[])
+{
+    int j, x, borrow, ncancel;
+    
+    if (expdif > n) {
+        expdif = n;
+    }
+    borrow = 0;
+    for (j = n - 1; j >= expdif; j--) {
+        x = in1[j] - in2[j - expdif] + borrow;
+        borrow = x < 0 ? -1 : 0;
+        out[j] = x + (radix & borrow);
+    }
+    for (j = expdif - 1; j >= 0; j--) {
+        x = in1[j] + borrow;
+        borrow = x < 0 ? -1 : 0;
+        out[j] = x + (radix & borrow);
+    }
+    ncancel = 0;
+    for (j = 0; j < n && out[j] == 0; j++) {
+        ncancel = j + 1;
+    }
+    if (ncancel > 0 && ncancel < n) {
+        for (j = 0; j < n - ncancel; j++) {
+            out[j] = out[j + ncancel];
+        }
+        for (j = n - ncancel; j < n; j++) {
+            out[j] = 0;
+        }
+    }
+    return ncancel;
+}
+
+
+/* -------- mp_imul routines -------- */
+
+
+void mp_imul(int n, int radix, int in1[], int in2, int out[])
+{
+    void mp_unsgn_imul(int n, double dradix, int in1[], double din2, 
+            int out[]);
+    
+    if (in2 > 0) {
+        out[0] = in1[0];
+    } else if (in2 < 0) {
+        out[0] = -in1[0];
+        in2 = -in2;
+    } else {
+        out[0] = 0;
+    }
+    mp_unsgn_imul(n, radix, &in1[1], in2, &out[1]);
+    if (out[0] == 0) {
+        out[1] = 0;
+    }
+}
+
+
+int mp_idiv(int n, int radix, int in1[], int in2, int out[])
+{
+    void mp_load_0(int n, int radix, int out[]);
+    void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, 
+            int out[]);
+    
+    if (in2 == 0) {
+        return -1;
+    }
+    if (in2 > 0) {
+        out[0] = in1[0];
+    } else {
+        out[0] = -in1[0];
+        in2 = -in2;
+    }
+    if (in1[0] == 0) {
+        mp_load_0(n, radix, out);
+        return 0;
+    }
+    mp_unsgn_idiv(n, radix, &in1[1], in2, &out[1]);
+    return 0;
+}
+
+
+void mp_idiv_2(int n, int radix, int in[], int out[])
+{
+    int j, ix, carry, shift;
+    
+    out[0] = in[0];
+    shift = 0;
+    if (in[2] == 1) {
+        shift = 1;
+    }
+    out[1] = in[1] - shift;
+    carry = -shift;
+    for (j = 2; j <= n + 1 - shift; j++) {
+        ix = in[j + shift] + (radix & carry);
+        carry = -(ix & 1);
+        out[j] = ix >> 1;
+    }
+    if (shift > 0) {
+        out[n + 1] = (radix & carry) >> 1;
+    }
+}
+
+
+/* -------- mp_imul child routines -------- */
+
+
+void mp_unsgn_imul(int n, double dradix, int in1[], double din2, 
+        int out[])
+{
+    int j, carry, shift;
+    double x, d1_radix;
+    
+    d1_radix = 1.0 / dradix;
+    carry = 0;
+    for (j = n; j >= 1; j--) {
+        x = din2 * in1[j] + carry + 0.5;
+        carry = (int) (d1_radix * x);
+        out[j] = (int) (x - dradix * carry);
+    }
+    shift = 0;
+    x = carry + 0.5;
+    while (x > 1) {
+        x *= d1_radix;
+        shift++;
+    }
+    out[0] = in1[0] + shift;
+    if (shift > 0) {
+        while (shift > n) {
+            carry = (int) (d1_radix * carry + 0.5);
+            shift--;
+        }
+        for (j = n; j >= shift + 1; j--) {
+            out[j] = out[j - shift];
+        }
+        for (j = shift; j >= 1; j--) {
+            x = carry + 0.5;
+            carry = (int) (d1_radix * x);
+            out[j] = (int) (x - dradix * carry);
+        }
+    }
+}
+
+
+void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, 
+        int out[])
+{
+    int j, ix, carry, shift;
+    double x, d1_in2;
+    
+    d1_in2 = 1.0 / din2;
+    shift = 0;
+    x = 0;
+    do {
+        shift++;
+        x *= dradix;
+        if (shift <= n) {
+            x += in1[shift];
+        }
+    } while (x < din2 - 0.5);
+    x += 0.5;
+    ix = (int) (d1_in2 * x);
+    carry = (int) (x - din2 * ix);
+    out[1] = ix;
+    shift--;
+    out[0] = in1[0] - shift;
+    if (shift >= n) {
+        shift = n - 1;
+    }
+    for (j = 2; j <= n - shift; j++) {
+        x = in1[j + shift] + dradix * carry + 0.5;
+        ix = (int) (d1_in2 * x);
+        carry = (int) (x - din2 * ix);
+        out[j] = ix;
+    }
+    for (j = n - shift + 1; j <= n; j++) {
+        x = dradix * carry + 0.5;
+        ix = (int) (d1_in2 * x);
+        carry = (int) (x - din2 * ix);
+        out[j] = ix;
+    }
+}
+
+
+/* -------- mp_mul routines -------- */
+
+
+double mp_mul_radix_test(int n, int radix, int nfft, 
+        double tmpfft[], int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    double mp_mul_d2i_test(int radix, int nfft, double din[]);
+    int j, ndata, radix_2;
+    
+    ndata = (nfft >> 1) + 1;
+    if (ndata > n) {
+        ndata = n;
+    }
+    tmpfft[nfft + 1] = radix - 1;
+    for (j = nfft; j > ndata; j--) {
+        tmpfft[j] = 0;
+    }
+    radix_2 = (radix + 1) / 2;
+    for (j = ndata; j > 2; j--) {
+        tmpfft[j] = radix_2;
+    }
+    tmpfft[2] = radix;
+    tmpfft[1] = radix - 1;
+    tmpfft[0] = 0;
+    rdft(nfft, 1, &tmpfft[1], ip, w);
+    mp_mul_csqu(nfft, tmpfft);
+    rdft(nfft, -1, &tmpfft[1], ip, w);
+    return 2 * mp_mul_d2i_test(radix, nfft, tmpfft);
+}
+
+
+void mp_mul(int n, int radix, int in1[], int in2[], int out[], 
+        int tmp[], int nfft, double tmp1fft[], double tmp2fft[], 
+        double tmp3fft[], int ip[], double w[])
+{
+    void mp_copy(int n, int radix, int in[], int out[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_cmuladd(int nfft, double din1[], double din2[], 
+            double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    int n_h, shift;
+    
+    shift = (nfft >> 1) + 1;
+    while (n > shift) {
+        if (in1[shift + 2] + in2[shift + 2] != 0) {
+            break;
+        }
+        shift++;
+    }
+    n_h = n / 2 + 1;
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp3fft = (upper) in1 * (lower) in2 ---- */
+    mp_mul_i2d(n, radix, nfft, 0, in1, tmp1fft);
+    rdft(nfft, 1, &tmp1fft[1], ip, w);
+    mp_mul_i2d(n, radix, nfft, shift, in2, tmp3fft);
+    rdft(nfft, 1, &tmp3fft[1], ip, w);
+    mp_mul_cmul(nfft, tmp1fft, tmp3fft);
+    /* ---- tmp = (upper) in1 * (upper) in2 ---- */
+    mp_mul_i2d(n, radix, nfft, 0, in2, tmp2fft);
+    rdft(nfft, 1, &tmp2fft[1], ip, w);
+    mp_mul_cmul(nfft, tmp2fft, tmp1fft);
+    rdft(nfft, -1, &tmp1fft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, tmp1fft, tmp);
+    /* ---- tmp3fft += (upper) in2 * (lower) in1 ---- */
+    mp_mul_i2d(n, radix, nfft, shift, in1, tmp1fft);
+    rdft(nfft, 1, &tmp1fft[1], ip, w);
+    mp_mul_cmuladd(nfft, tmp1fft, tmp2fft, tmp3fft);
+    /* ---- out = tmp + tmp3fft ---- */
+    rdft(nfft, -1, &tmp3fft[1], ip, w);
+    mp_mul_d2i(n_h, radix, nfft, tmp3fft, out);
+    if (out[0] != 0) {
+        mp_add(n, radix, out, tmp, out);
+    } else {
+        mp_copy(n, radix, tmp, out);
+    }
+}
+
+
+void mp_squ(int n, int radix, int in[], int out[], int tmp[], 
+        int nfft, double tmp1fft[], double tmp2fft[], 
+        int ip[], double w[])
+{
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    int n_h, shift;
+    
+    shift = (nfft >> 1) + 1;
+    while (n > shift) {
+        if (in[shift + 2] != 0) {
+            break;
+        }
+        shift++;
+    }
+    n_h = n / 2 + 1;
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp = (upper) in * (lower) in ---- */
+    mp_mul_i2d(n, radix, nfft, 0, in, tmp1fft);
+    rdft(nfft, 1, &tmp1fft[1], ip, w);
+    mp_mul_i2d(n, radix, nfft, shift, in, tmp2fft);
+    rdft(nfft, 1, &tmp2fft[1], ip, w);
+    mp_mul_cmul(nfft, tmp1fft, tmp2fft);
+    rdft(nfft, -1, &tmp2fft[1], ip, w);
+    mp_mul_d2i(n_h, radix, nfft, tmp2fft, tmp);
+    /* ---- out = 2 * tmp + ((upper) in)^2 ---- */
+    mp_mul_csqu(nfft, tmp1fft);
+    rdft(nfft, -1, &tmp1fft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, tmp1fft, out);
+    if (tmp[0] != 0) {
+        mp_add(n_h, radix, tmp, tmp, tmp);
+        mp_add(n, radix, out, tmp, out);
+    }
+}
+
+
+void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+        int nfft, double in1fft[], double outfft[], int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    
+    mp_mul_i2d(n, radix, nfft, 0, in1, in1fft);
+    rdft(nfft, 1, &in1fft[1], ip, w);
+    mp_mul_i2d(n, radix, nfft, 0, in2, outfft);
+    rdft(nfft, 1, &outfft[1], ip, w);
+    mp_mul_cmul(nfft, in1fft, outfft);
+    rdft(nfft, -1, &outfft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, outfft, out);
+}
+
+
+void mp_mulh_use_in1fft(int n, int radix, double in1fft[], 
+        int shift, int in2[], int out[], int nfft, double outfft[], 
+        int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    int n_h;
+    
+    while (n > shift) {
+        if (in2[shift + 2] != 0) {
+            break;
+        }
+        shift++;
+    }
+    n_h = n / 2 + 1;
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    mp_mul_i2d(n, radix, nfft, shift, in2, outfft);
+    rdft(nfft, 1, &outfft[1], ip, w);
+    mp_mul_cmul(nfft, in1fft, outfft);
+    rdft(nfft, -1, &outfft[1], ip, w);
+    mp_mul_d2i(n_h, radix, nfft, outfft, out);
+}
+
+
+void mp_squh(int n, int radix, int in[], int out[], 
+        int nfft, double inoutfft[], int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    
+    mp_mul_i2d(n, radix, nfft, 0, in, inoutfft);
+    rdft(nfft, 1, &inoutfft[1], ip, w);
+    mp_mul_csqu(nfft, inoutfft);
+    rdft(nfft, -1, &inoutfft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, inoutfft, out);
+}
+
+
+void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], 
+        int nfft, int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    
+    mp_mul_csqu(nfft, inoutfft);
+    rdft(nfft, -1, &inoutfft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, inoutfft, out);
+}
+
+
+/* -------- mp_mul child routines -------- */
+
+
+void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+        int in[], double dout[])
+{
+    int j, x, carry, ndata, radix_2, topdgt;
+    
+    ndata = 0;
+    topdgt = 0;
+    if (n > shift) {
+        topdgt = in[shift + 2];
+        ndata = (nfft >> 1) + 1;
+        if (ndata > n - shift) {
+            ndata = n - shift;
+        }
+    }
+    dout[nfft + 1] = in[0] * topdgt;
+    for (j = nfft; j > ndata; j--) {
+        dout[j] = 0;
+    }
+    /* ---- abs(dout[j]) <= radix/2 (to keep FFT precision) ---- */
+    if (ndata > 1) {
+        radix_2 = radix / 2;
+        carry = 0;
+        for (j = ndata + 1; j > 3; j--) {
+            x = in[j + shift] - carry;
+            carry = x >= radix_2 ? -1 : 0;
+            dout[j - 1] = x - (radix & carry);
+        }
+        dout[2] = in[shift + 3] - carry;
+    }
+    dout[1] = topdgt;
+    dout[0] = in[1] - shift;
+}
+
+
+void mp_mul_cmul(int nfft, double din[], double dinout[])
+{
+    int j;
+    double xr, xi, yr, yi;
+    
+    dinout[0] += din[0];
+    dinout[1] *= din[1];
+    dinout[2] *= din[2];
+    for (j = 3; j < nfft; j += 2) {
+        xr = din[j];
+        xi = din[j + 1];
+        yr = dinout[j];
+        yi = dinout[j + 1];
+        dinout[j] = xr * yr - xi * yi;
+        dinout[j + 1] = xr * yi + xi * yr;
+    }
+    dinout[nfft + 1] *= din[nfft + 1];
+}
+
+
+void mp_mul_cmuladd(int nfft, double din1[], double din2[], 
+        double dinout[])
+{
+    int j;
+    double xr, xi, yr, yi;
+    
+    dinout[1] += din1[1] * din2[1];
+    dinout[2] += din1[2] * din2[2];
+    for (j = 3; j < nfft; j += 2) {
+        xr = din1[j];
+        xi = din1[j + 1];
+        yr = din2[j];
+        yi = din2[j + 1];
+        dinout[j] += xr * yr - xi * yi;
+        dinout[j + 1] += xr * yi + xi * yr;
+    }
+    dinout[nfft + 1] += din1[nfft + 1] * din2[nfft + 1];
+}
+
+
+void mp_mul_csqu(int nfft, double dinout[])
+{
+    int j;
+    double xr, xi;
+    
+    dinout[0] *= 2;
+    dinout[1] *= dinout[1];
+    dinout[2] *= dinout[2];
+    for (j = 3; j < nfft; j += 2) {
+        xr = dinout[j];
+        xi = dinout[j + 1];
+        dinout[j] = xr * xr - xi * xi;
+        dinout[j + 1] = 2 * xr * xi;
+    }
+    dinout[nfft + 1] *= dinout[nfft + 1];
+}
+
+
+void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[])
+{
+    int j, carry, carry1, carry2, shift, ndata;
+    double x, scale, d1_radix, d1_radix2, pow_radix, topdgt;
+    
+    scale = 2.0 / nfft;
+    d1_radix = 1.0 / radix;
+    d1_radix2 = d1_radix * d1_radix;
+    topdgt = din[nfft + 1];
+    x = topdgt < 0 ? -topdgt : topdgt;
+    shift = x + 0.5 >= radix ? 1 : 0;
+    /* ---- correction of cyclic convolution of din[1] ---- */
+    x *= nfft * 0.5;
+    din[nfft + 1] = din[1] - x;
+    din[1] = x;
+    /* ---- output of digits ---- */
+    ndata = n;
+    if (n > nfft + 1 + shift) {
+        ndata = nfft + 1 + shift;
+        for (j = n + 1; j > ndata + 1; j--) {
+            out[j] = 0;
+        }
+    }
+    x = 0;
+    pow_radix = 1;
+    for (j = ndata + 1 - shift; j <= nfft + 1; j++) {
+        x += pow_radix * din[j];
+        pow_radix *= d1_radix;
+        if (pow_radix < DBL_EPSILON) {
+            break;
+        }
+    }
+    x = d1_radix2 * (scale * x + 0.5);
+    carry2 = ((int) x) - 1;
+    carry = (int) (radix * (x - carry2) + 0.5);
+    for (j = ndata; j > 1; j--) {
+        x = d1_radix2 * (scale * din[j - shift] + carry + 0.5);
+        carry = carry2;
+        carry2 = ((int) x) - 1;
+        x = radix * (x - carry2);
+        carry1 = (int) x;
+        out[j + 1] = (int) (radix * (x - carry1));
+        carry += carry1;
+    }
+    x = carry + ((double) radix) * carry2 + 0.5;
+    if (shift == 0) {
+        x += scale * din[1];
+    }
+    carry = (int) (d1_radix * x);
+    out[2] = (int) (x - ((double) radix) * carry);
+    if (carry > 0) {
+        for (j = n + 1; j > 2; j--) {
+            out[j] = out[j - 1];
+        }
+        out[2] = carry;
+        shift++;
+    }
+    /* ---- output of exp, sgn ---- */
+    x = din[0] + shift + 0.5;
+    shift = ((int) x) - 1;
+    out[1] = shift + ((int) (x - shift));
+    out[0] = topdgt > 0.5 ? 1 : -1;
+    if (out[2] == 0) {
+        out[0] = 0;
+        out[1] = 0;
+    }
+}
+
+
+double mp_mul_d2i_test(int radix, int nfft, double din[])
+{
+    int j, carry, carry1, carry2;
+    double x, scale, d1_radix, d1_radix2, err;
+    
+    scale = 2.0 / nfft;
+    d1_radix = 1.0 / radix;
+    d1_radix2 = d1_radix * d1_radix;
+    /* ---- correction of cyclic convolution of din[1] ---- */
+    x = din[nfft + 1] * nfft * 0.5;
+    if (x < 0) {
+        x = -x;
+    }
+    din[nfft + 1] = din[1] - x;
+    /* ---- check of digits ---- */
+    err = 0;
+    carry = 0;
+    carry2 = 0;
+    for (j = nfft + 1; j > 1; j--) {
+        x = d1_radix2 * (scale * din[j] + carry + 0.5);
+        carry = carry2;
+        carry2 = ((int) x) - 1;
+        x = radix * (x - carry2);
+        carry1 = (int) x;
+        x = radix * (x - carry1);
+        carry += carry1;
+        x = x - 0.5 - ((int) x);
+        if (x > err) {
+            err = x;
+        } else if (-x > err) {
+            err = -x;
+        }
+    }
+    return err;
+}
+
+
+/* -------- mp_inv routines -------- */
+
+
+int mp_inv(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[])
+{
+    int mp_get_nfft_init(int radix, int nfft_max);
+    void mp_inv_init(int n, int radix, int in[], int out[]);
+    int mp_inv_newton(int n, int radix, int in[], int inout[], 
+            int tmp1[], int tmp2[], int nfft, double tmp1fft[], 
+            double tmp2fft[], int ip[], double w[]);
+    int n_nwt, nfft_nwt, thr, prc;
+    
+    if (in[0] == 0) {
+        return -1;
+    }
+    nfft_nwt = mp_get_nfft_init(radix, nfft);
+    n_nwt = nfft_nwt + 2;
+    if (n_nwt > n) {
+        n_nwt = n;
+    }
+    mp_inv_init(n_nwt, radix, in, out);
+    thr = 8;
+    do {
+        n_nwt = nfft_nwt + 2;
+        if (n_nwt > n) {
+            n_nwt = n;
+        }
+        prc = mp_inv_newton(n_nwt, radix, in, out, 
+                tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, ip, w);
+        if (thr * nfft_nwt >= nfft) {
+            thr = 0;
+            if (2 * prc <= n_nwt - 2) {
+                nfft_nwt >>= 1;
+            }
+        } else {
+            if (3 * prc < n_nwt - 2) {
+                nfft_nwt >>= 1;
+            }
+        }
+        nfft_nwt <<= 1;
+    } while (nfft_nwt <= nfft);
+    return 0;
+}
+
+
+int mp_sqrt(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[])
+{
+    void mp_load_0(int n, int radix, int out[]);
+    int mp_get_nfft_init(int radix, int nfft_max);
+    void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]);
+    int mp_sqrt_newton(int n, int radix, int in[], int inout[], 
+            int inout_rev[], int tmp[], int nfft, double tmp1fft[], 
+            double tmp2fft[], int ip[], double w[], int *n_tmp1fft);
+    int n_nwt, nfft_nwt, thr, prc, n_tmp1fft;
+    
+    if (in[0] < 0) {
+        return -1;
+    } else if (in[0] == 0) {
+        mp_load_0(n, radix, out);
+        return 0;
+    }
+    nfft_nwt = mp_get_nfft_init(radix, nfft);
+    n_nwt = nfft_nwt + 2;
+    if (n_nwt > n) {
+        n_nwt = n;
+    }
+    mp_sqrt_init(n_nwt, radix, in, out, tmp1);
+    n_tmp1fft = 0;
+    thr = 8;
+    do {
+        n_nwt = nfft_nwt + 2;
+        if (n_nwt > n) {
+            n_nwt = n;
+        }
+        prc = mp_sqrt_newton(n_nwt, radix, in, out, 
+                tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, 
+                ip, w, &n_tmp1fft);
+        if (thr * nfft_nwt >= nfft) {
+            thr = 0;
+            if (2 * prc <= n_nwt - 2) {
+                nfft_nwt >>= 1;
+            }
+        } else {
+            if (3 * prc < n_nwt - 2) {
+                nfft_nwt >>= 1;
+            }
+        }
+        nfft_nwt <<= 1;
+    } while (nfft_nwt <= nfft);
+    return 0;
+}
+
+
+/* -------- mp_inv child routines -------- */
+
+
+int mp_get_nfft_init(int radix, int nfft_max)
+{
+    int nfft_init;
+    double r;
+    
+    r = radix;
+    nfft_init = 1;
+    do {
+        r *= r;
+        nfft_init <<= 1;
+    } while (DBL_EPSILON * r < 1 && nfft_init < nfft_max);
+    return nfft_init;
+}
+
+
+void mp_inv_init(int n, int radix, int in[], int out[])
+{
+    void mp_unexp_d2mp(int n, int radix, double din, int out[]);
+    double mp_unexp_mp2d(int n, int radix, int in[]);
+    int outexp;
+    double din;
+    
+    out[0] = in[0];
+    outexp = -in[1];
+    din = 1.0 / mp_unexp_mp2d(n, radix, &in[2]);
+    while (din < 1) {
+        din *= radix;
+        outexp--;
+    }
+    out[1] = outexp;
+    mp_unexp_d2mp(n, radix, din, &out[2]);
+}
+
+
+void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[])
+{
+    void mp_unexp_d2mp(int n, int radix, double din, int out[]);
+    double mp_unexp_mp2d(int n, int radix, int in[]);
+    int outexp;
+    double din;
+    
+    out[0] = 1;
+    out_rev[0] = 1;
+    outexp = in[1];
+    din = mp_unexp_mp2d(n, radix, &in[2]);
+    if (outexp % 2 != 0) {
+        din *= radix;
+        outexp--;
+    }
+    outexp /= 2;
+    din = sqrt(din);
+    if (din < 1) {
+        din *= radix;
+        outexp--;
+    }
+    out[1] = outexp;
+    mp_unexp_d2mp(n, radix, din, &out[2]);
+    outexp = -outexp;
+    din = 1.0 / din;
+    while (din < 1) {
+        din *= radix;
+        outexp--;
+    }
+    out_rev[1] = outexp;
+    mp_unexp_d2mp(n, radix, din, &out_rev[2]);
+}
+
+
+void mp_unexp_d2mp(int n, int radix, double din, int out[])
+{
+    int j, x;
+    
+    for (j = 0; j < n; j++) {
+        x = (int) din;
+        if (x >= radix) {
+            x = radix - 1;
+            din = radix;
+        }
+        din = radix * (din - x);
+        out[j] = x;
+    }
+}
+
+
+double mp_unexp_mp2d(int n, int radix, int in[])
+{
+    int j;
+    double d1_radix, dout;
+    
+    d1_radix = 1.0 / radix;
+    dout = 0;
+    for (j = n - 1; j >= 0; j--) {
+        dout = d1_radix * dout + in[j];
+    }
+    return dout;
+}
+
+
+int mp_inv_newton(int n, int radix, int in[], int inout[], 
+        int tmp1[], int tmp2[], int nfft, double tmp1fft[], 
+        double tmp2fft[], int ip[], double w[])
+{
+    void mp_load_1(int n, int radix, int out[]);
+    void mp_round(int n, int radix, int m, int inout[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+            int nfft, double in1fft[], double outfft[], 
+            int ip[], double w[]);
+    void mp_mulh_use_in1fft(int n, int radix, double in1fft[], 
+            int shift, int in2[], int out[], int nfft, double outfft[], 
+            int ip[], double w[]);
+    int n_h, shift, prc;
+    
+    shift = (nfft >> 1) + 1;
+    n_h = n / 2 + 1;
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp1 = inout * (upper) in (half to normal precision) ---- */
+    mp_round(n, radix, shift, inout);
+    mp_mulh(n, radix, inout, in, tmp1, 
+            nfft, tmp1fft, tmp2fft, ip, w);
+    /* ---- tmp2 = 1 - tmp1 ---- */
+    mp_load_1(n, radix, tmp2);
+    mp_sub(n, radix, tmp2, tmp1, tmp2);
+    /* ---- tmp2 -= inout * (lower) in (half precision) ---- */
+    mp_mulh_use_in1fft(n, radix, tmp1fft, shift, in, tmp1, 
+            nfft, tmp2fft, ip, w);
+    mp_sub(n_h, radix, tmp2, tmp1, tmp2);
+    /* ---- get precision ---- */
+    prc = -tmp2[1];
+    if (tmp2[0] == 0) {
+        prc = nfft + 1;
+    }
+    /* ---- tmp2 *= inout (half precision) ---- */
+    mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, 
+            nfft, tmp2fft, ip, w);
+    /* ---- inout += tmp2 ---- */
+    if (tmp2[0] != 0) {
+        mp_add(n, radix, inout, tmp2, inout);
+    }
+    return prc;
+}
+
+
+int mp_sqrt_newton(int n, int radix, int in[], int inout[], 
+        int inout_rev[], int tmp[], int nfft, double tmp1fft[], 
+        double tmp2fft[], int ip[], double w[], int *n_tmp1fft)
+{
+    void mp_round(int n, int radix, int m, int inout[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_idiv_2(int n, int radix, int in[], int out[]);
+    void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+            int nfft, double in1fft[], double outfft[], 
+            int ip[], double w[]);
+    void mp_squh(int n, int radix, int in[], int out[], 
+            int nfft, double inoutfft[], int ip[], double w[]);
+    void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], 
+            int nfft, int ip[], double w[]);
+    int n_h, nfft_h, shift, prc;
+    
+    nfft_h = nfft >> 1;
+    shift = nfft_h + 1;
+    if (nfft_h < 2) {
+        nfft_h = 2;
+    }
+    n_h = n / 2 + 1;
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp = inout_rev^2 (1/4 to half precision) ---- */
+    mp_round(n_h, radix, (nfft_h >> 1) + 1, inout_rev);
+    if (*n_tmp1fft != nfft_h) {
+        mp_squh(n_h, radix, inout_rev, tmp, 
+                nfft_h, tmp1fft, ip, w);
+    } else {
+        mp_squh_use_in1fft(n_h, radix, tmp1fft, tmp, 
+                nfft_h, ip, w);
+    }
+    /* ---- tmp = inout_rev - inout * tmp (half precision) ---- */
+    mp_round(n, radix, shift, inout);
+    mp_mulh(n_h, radix, inout, tmp, tmp, 
+            nfft, tmp1fft, tmp2fft, ip, w);
+    mp_sub(n_h, radix, inout_rev, tmp, tmp);
+    /* ---- inout_rev += tmp ---- */
+    mp_add(n_h, radix, inout_rev, tmp, inout_rev);
+    /* ---- tmp = in - inout^2 (half to normal precision) ---- */
+    mp_squh_use_in1fft(n, radix, tmp1fft, tmp, 
+            nfft, ip, w);
+    mp_sub(n, radix, in, tmp, tmp);
+    /* ---- get precision ---- */
+    prc = in[1] - tmp[1];
+    if (in[2] > tmp[2]) {
+        prc++;
+    }
+    if (tmp[0] == 0) {
+        prc = nfft + 1;
+    }
+    /* ---- tmp = tmp * inout_rev / 2 (half precision) ---- */
+    mp_round(n_h, radix, shift, inout_rev);
+    mp_mulh(n_h, radix, inout_rev, tmp, tmp, 
+            nfft, tmp1fft, tmp2fft, ip, w);
+    *n_tmp1fft = nfft;
+    mp_idiv_2(n_h, radix, tmp, tmp);
+    /* ---- inout += tmp ---- */
+    if (tmp[0] != 0) {
+        mp_add(n, radix, inout, tmp, inout);
+    }
+    return prc;
+}
+
+
+/* -------- mp_io routines -------- */
+
+
+void mp_sprintf(int n, int log10_radix, int in[], char out[])
+{
+    int j, k, x, y, outexp, shift;
+    
+    if (in[0] < 0) {
+        *out++ = '-';
+    }
+    x = in[2];
+    shift = log10_radix;
+    for (k = log10_radix; k > 0; k--) {
+        y = x % 10;
+        x /= 10;
+        out[k] = '0' + y;
+        if (y != 0) {
+            shift = k;
+        }
+    }
+    out[0] = out[shift];
+    out[1] = '.';
+    for (k = 1; k <= log10_radix - shift; k++) {
+        out[k + 1] = out[k + shift];
+    }
+    outexp = log10_radix - shift;
+    out += outexp + 2;
+    for (j = 3; j <= n + 1; j++) {
+        x = in[j];
+        for (k = log10_radix - 1; k >= 0; k--) {
+            y = x % 10;
+            x /= 10;
+            out[k] = '0' + y;
+        }
+        out += log10_radix;
+    }
+    *out++ = 'e';
+    outexp += log10_radix * in[1];
+    sprintf(out, "%d", outexp);
+}
+
+
+void mp_sscanf(int n, int log10_radix, char in[], int out[])
+{
+    char *s;
+    int j, x, outexp, outexp_mod;
+    
+    while (*in == ' ') {
+        in++;
+    }
+    out[0] = 1;
+    if (*in == '-') {
+        out[0] = -1;
+        in++;
+    } else if (*in == '+') {
+        in++;
+    }
+    while (*in == ' ' || *in == '0') {
+        in++;
+    }
+    outexp = 0;
+    for (s = in; *s != '\0'; s++) {
+        if (*s == 'e' || *s == 'E' || *s == 'd' || *s == 'D') {
+            if (sscanf(++s, "%d", &outexp) != 1) {
+                outexp = 0;
+            }
+            break;
+        }
+    }
+    if (*in == '.') {
+        do {
+            outexp--;
+            while (*++in == ' ');
+        } while (*in == '0' && *in != '\0');
+    } else if (*in != '\0') {
+        s = in;
+        while (*++s == ' ');
+        while (*s >= '0' && *s <= '9' && *s != '\0') {
+            outexp++;
+            while (*++s == ' ');
+        }
+    }
+    x = outexp / log10_radix;
+    outexp_mod = outexp - log10_radix * x;
+    if (outexp_mod < 0) {
+        x--;
+        outexp_mod += log10_radix;
+    }
+    out[1] = x;
+    x = 0;
+    j = 2;
+    for (s = in; *s != '\0'; s++) {
+        if (*s == '.' || *s == ' ') {
+            continue;
+        }
+        if (*s < '0' || *s > '9') {
+            break;
+        }
+        x = 10 * x + (*s - '0');
+        if (--outexp_mod < 0) {
+            if (j > n + 1) {
+                break;
+            }
+            out[j++] = x;
+            x = 0;
+            outexp_mod = log10_radix - 1;
+        }
+    }
+    while (outexp_mod-- >= 0) {
+        x *= 10;
+    }
+    while (j <= n + 1) {
+        out[j++] = x;
+        x = 0;
+    }
+    if (out[2] == 0) {
+        out[0] = 0;
+        out[1] = 0;
+    }
+}
+
+
+void mp_fprintf(int n, int log10_radix, int in[], FILE *fout)
+{
+    int j, k, x, y, outexp, shift;
+    char out[256];
+    
+    if (in[0] < 0) {
+        putc('-', fout);
+    }
+    x = in[2];
+    shift = log10_radix;
+    for (k = log10_radix; k > 0; k--) {
+        y = x % 10;
+        x /= 10;
+        out[k] = '0' + y;
+        if (y != 0) {
+            shift = k;
+        }
+    }
+    putc(out[shift], fout);
+    putc('.', fout);
+    for (k = 1; k <= log10_radix - shift; k++) {
+        putc(out[k + shift], fout);
+    }
+    outexp = log10_radix - shift;
+    for (j = 3; j <= n + 1; j++) {
+        x = in[j];
+        for (k = log10_radix - 1; k >= 0; k--) {
+            y = x % 10;
+            x /= 10;
+            out[k] = '0' + y;
+        }
+        for (k = 0; k < log10_radix; k++) {
+            putc(out[k], fout);
+        }
+    }
+    putc('e', fout);
+    outexp += log10_radix * in[1];
+    sprintf(out, "%d", outexp);
+    for (k = 0; out[k] != '\0'; k++) {
+        putc(out[k], fout);
+    }
+}
+
+
diff --git a/sample2d/Makefile.f77 b/sample2d/Makefile.f77
new file mode 100644
index 0000000..0162fdc
--- /dev/null
+++ b/sample2d/Makefile.f77
@@ -0,0 +1,60 @@
+# ---- for GNU g77 ----
+
+F77 = g77
+
+FFLAGS = -Wall
+
+OFLAGS = -O2
+
+# ---- for SUN WS f77 ----
+#
+#F77 = f77
+#
+#FFLAGS = 
+#
+#OFLAGS = -xO2
+
+
+
+
+all: fft4f2dt_f fftsg2dt_f fftsg3dt_f
+
+
+fft4f2dt_f : fft4f2dt_f.o fft4f2d_f.o
+	$(F77) fft4f2dt_f.o fft4f2d_f.o -o fft4f2dt_f
+
+fftsg2dt_f : fftsg2dt_f.o fftsg2d_f.o fftsg_f.o
+	$(F77) fftsg2dt_f.o fftsg2d_f.o fftsg_f.o -o fftsg2dt_f
+
+fftsg3dt_f : fftsg3dt_f.o fftsg3d_f.o fftsg_f.o
+	$(F77) fftsg3dt_f.o fftsg3d_f.o fftsg_f.o -o fftsg3dt_f
+
+
+fft4f2dt_f.o : fft4f2dt.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c fft4f2dt.f -o fft4f2dt_f.o
+
+fftsg2dt_f.o : fftsg2dt.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c fftsg2dt.f -o fftsg2dt_f.o
+
+fftsg3dt_f.o : fftsg3dt.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c fftsg3dt.f -o fftsg3dt_f.o
+
+
+fft4f2d_f.o : ../fft4f2d.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c ../fft4f2d.f -o fft4f2d_f.o
+
+fftsg2d_f.o : ../fftsg2d.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg2d.f -o fftsg2d_f.o
+
+fftsg3d_f.o : ../fftsg3d.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg3d.f -o fftsg3d_f.o
+
+fftsg_f.o : ../fftsg.f
+	$(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg.f -o fftsg_f.o
+
+
+
+
+clean:
+	rm -f *.o
+
diff --git a/sample2d/Makefile.pth b/sample2d/Makefile.pth
new file mode 100644
index 0000000..d46941e
--- /dev/null
+++ b/sample2d/Makefile.pth
@@ -0,0 +1,54 @@
+# ---- for GNU gcc ----
+
+CC = gcc
+
+CFLAGS = -Wall -DUSE_FFT2D_PTHREADS -DUSE_FFT3D_PTHREADS
+
+OFLAGS = -O2
+
+# ---- for SUN WS cc ----
+#
+#CC = cc
+#
+#CFLAGS = -Wall -DUSE_FFT2D_PTHREADS -DUSE_FFT3D_PTHREADS
+#
+#OFLAGS = -xO2
+
+
+
+
+all: fftsg2dt_pt fftsg3dt_pt
+
+
+fftsg2dt_pt : fftsg2dt.o fftsg2dpt.o fftsg.o alloc.o
+	$(CC) fftsg2dt.o fftsg2dpt.o fftsg.o alloc.o -lm -lpthread -o fftsg2dt_pt
+
+fftsg3dt_pt : fftsg3dt.o fftsg3dpt.o fftsg.o alloc.o
+	$(CC) fftsg3dt.o fftsg3dpt.o fftsg.o alloc.o -lm -lpthread -o fftsg3dt_pt
+
+
+fftsg2dt.o : fftsg2dt.c
+	$(CC) $(CFLAGS) $(OFLAGS) -c fftsg2dt.c -o fftsg2dt.o
+
+fftsg3dt.o : fftsg3dt.c
+	$(CC) $(CFLAGS) $(OFLAGS) -c fftsg3dt.c -o fftsg3dt.o
+
+
+fftsg2dpt.o : ../fftsg2d.c
+	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg2d.c -o fftsg2dpt.o
+
+fftsg3dpt.o : ../fftsg3d.c
+	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg3d.c -o fftsg3dpt.o
+
+fftsg.o : ../fftsg.c
+	$(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg.c -o fftsg.o
+
+alloc.o : ../alloc.c
+	$(CC) $(CFLAGS) $(OFLAGS) -c ../alloc.c -o alloc.o
+
+
+
+
+clean:
+	rm -f *.o
+
diff --git a/sample2d/alloc.h b/sample2d/alloc.h
new file mode 100644
index 0000000..3467cc4
--- /dev/null
+++ b/sample2d/alloc.h
@@ -0,0 +1,20 @@
+/* ---- memory allocation ---- */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+int *alloc_1d_int(int n1);
+void free_1d_int(int *i);
+double *alloc_1d_double(int n1);
+void free_1d_double(double *d);
+int **alloc_2d_int(int n1, int n2);
+void free_2d_int(int **ii);
+double **alloc_2d_double(int n1, int n2);
+void free_2d_double(double **dd);
+int ***alloc_3d_int(int n1, int n2, int n3);
+void free_3d_int(int ***iii);
+double ***alloc_3d_double(int n1, int n2, int n3);
+void free_3d_double(double ***ddd);
+
diff --git a/sample2d/fft4f2dt.c b/sample2d/fft4f2dt.c
new file mode 100644
index 0000000..2f300fd
--- /dev/null
+++ b/sample2d/fft4f2dt.c
@@ -0,0 +1,109 @@
+/* test of fft4f2d.c */
+
+#include <math.h>
+#include <stdio.h>
+#include "alloc.h"
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+
+int main()
+{
+    void cdft2d(int, int, int, double **, int *, double *);
+    void rdft2d(int, int, int, double **, int *, double *);
+    void ddct2d(int, int, int, double **, double **, int *, double *);
+    void ddst2d(int, int, int, double **, double **, int *, double *);
+    void putdata2d(int n1, int n2, double **a);
+    double errorcheck2d(int n1, int n2, double scale, double **a);
+    int *ip, n1, n2, n, i;
+    double **a, **t, *w, err;
+
+    printf("data length n1=? (n1 = power of 2) \n");
+    scanf("%d", &n1);
+    printf("data length n2=? (n2 = power of 2) \n");
+    scanf("%d", &n2);
+
+    a = alloc_2d_double(n1, n2);
+    t = alloc_2d_double(n1, n2);
+    n = MAX(n1, n2 / 2);
+    ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
+    n = MAX(n1 / 2, n2 / 4) + MAX(n1, n2);
+    w = alloc_1d_double(n);
+    ip[0] = 0;
+
+    /* check of CDFT */
+    putdata2d(n1, n2, a);
+    cdft2d(n1, n2, 1, a, ip, w);
+    cdft2d(n1, n2, -1, a, ip, w);
+    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+    printf("cdft2d err= %g \n", err);
+
+    /* check of RDFT */
+    putdata2d(n1, n2, a);
+    rdft2d(n1, n2, 1, a, ip, w);
+    rdft2d(n1, n2, -1, a, ip, w);
+    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+    printf("rdft2d err= %g \n", err);
+
+    /* check of DDCT */
+    putdata2d(n1, n2, a);
+    ddct2d(n1, n2, 1, a, t, ip, w);
+    ddct2d(n1, n2, -1, a, t, ip, w);
+    for (i = 0; i <= n1 - 1; i++) {
+        a[i][0] *= 0.5;
+    }
+    for (i = 0; i <= n2 - 1; i++) {
+        a[0][i] *= 0.5;
+    }
+    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+    printf("ddct2d err= %g \n", err);
+
+    /* check of DDST */
+    putdata2d(n1, n2, a);
+    ddst2d(n1, n2, 1, a, t, ip, w);
+    ddst2d(n1, n2, -1, a, t, ip, w);
+    for (i = 0; i <= n1 - 1; i++) {
+        a[i][0] *= 0.5;
+    }
+    for (i = 0; i <= n2 - 1; i++) {
+        a[0][i] *= 0.5;
+    }
+    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+    printf("ddst2d err= %g \n", err);
+
+    free_1d_double(w);
+    free_1d_int(ip);
+    free_2d_double(t);
+    free_2d_double(a);
+    return 0;
+}
+
+
+void putdata2d(int n1, int n2, double **a)
+{
+    int j1, j2, seed = 0;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            a[j1][j2] = RND(&seed);
+        }
+    }
+}
+
+
+double errorcheck2d(int n1, int n2, double scale, double **a)
+{
+    int j1, j2, seed = 0;
+    double err = 0, e;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            e = RND(&seed) - a[j1][j2] * scale;
+            err = MAX(err, fabs(e));
+        }
+    }
+    return err;
+}
+
diff --git a/sample2d/fft4f2dt.f b/sample2d/fft4f2dt.f
new file mode 100644
index 0000000..43ac8ac
--- /dev/null
+++ b/sample2d/fft4f2dt.f
@@ -0,0 +1,95 @@
+! test of fft4f2d.f
+!
+      program main
+      integer nmax, nmaxsqrt
+      parameter (nmax = 1024)
+      parameter (nmaxsqrt = 32)
+      integer ip(0 : nmaxsqrt + 1), n1, n2, i
+      real*8 a(0 : nmax - 1, 0 : nmax - 1), 
+     &    t(0 : nmax - 1, 0 : nmax - 1), w(0 : nmax * 3 / 2 - 1), 
+     &    err, errorcheck2d
+!
+      write (*, *) 'data length n1=? (n1 = power of 2) '
+      read (*, *) n1
+      write (*, *) 'data length n2=? (n2 = power of 2) '
+      read (*, *) n2
+      ip(0) = 0
+!
+!   check of CDFT
+      call putdata2d(nmax, n1, n2, a)
+      call cdft2d(nmax, n1, n2, 1, a, ip, w)
+      call cdft2d(nmax, n1, n2, -1, a, ip, w)
+      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+      write (*, *) 'cdft2d err= ', err
+!
+!   check of RDFT
+      call putdata2d(nmax, n1, n2, a)
+      call rdft2d(nmax, n1, n2, 1, a, ip, w)
+      call rdft2d(nmax, n1, n2, -1, a, ip, w)
+      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+      write (*, *) 'rdft2d err= ', err
+!
+!   check of DDCT
+      call putdata2d(nmax, n1, n2, a)
+      call ddct2d(nmax, n1, n2, 1, a, t, ip, w)
+      call ddct2d(nmax, n1, n2, -1, a, t, ip, w)
+      do i = 0, n1 - 1
+          a(i, 0) = a(i, 0) * 0.5d0
+      end do
+      do i = 0, n2 - 1
+          a(0, i) = a(0, i) * 0.5d0
+      end do
+      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+      write (*, *) 'ddct2d err= ', err
+!
+!   check of DDST
+      call putdata2d(nmax, n1, n2, a)
+      call ddst2d(nmax, n1, n2, 1, a, t, ip, w)
+      call ddst2d(nmax, n1, n2, -1, a, t, ip, w)
+      do i = 0, n1 - 1
+          a(i, 0) = a(i, 0) * 0.5d0
+      end do
+      do i = 0, n2 - 1
+          a(0, i) = a(0, i) * 0.5d0
+      end do
+      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+      write (*, *) 'ddst2d err= ', err
+!
+      end
+!
+!
+      subroutine putdata2d(n1max, n1, n2, a)
+      integer n1max, n1, n2, j1, j2, seed
+      real*8 a(0 : n1max - 1, 0 : *), drnd
+      seed = 0
+      do j2 = 0, n2 - 1
+          do j1 = 0, n1 - 1
+              a(j1, j2) = drnd(seed)
+          end do
+      end do
+      end
+!
+!
+      function errorcheck2d(n1max, n1, n2, scale, a)
+      integer n1max, n1, n2, j1, j2, seed
+      real*8 scale, a(0 : n1max - 1, 0 : *), drnd, err, e, 
+     &    errorcheck2d
+      err = 0
+      seed = 0
+      do j2 = 0, n2 - 1
+          do j1 = 0, n1 - 1
+              e = drnd(seed) - a(j1, j2) * scale
+              err = max(err, abs(e))
+          end do
+      end do
+      errorcheck2d = err
+      end
+!
+!
+! random number generator, 0 <= drnd < 1
+      real*8 function drnd(seed)
+      integer seed
+      seed = mod(seed * 7141 + 54773, 259200)
+      drnd = seed * (1.0d0 / 259200)
+      end
+!
diff --git a/sample2d/fftsg2dt.c b/sample2d/fftsg2dt.c
new file mode 100644
index 0000000..c4cf935
--- /dev/null
+++ b/sample2d/fftsg2dt.c
@@ -0,0 +1,107 @@
+/* test of fftsg2d.c */
+
+#include <math.h>
+#include <stdio.h>
+#include "alloc.h"
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+
+int main()
+{
+    void cdft2d(int, int, int, double **, double *, int *, double *);
+    void rdft2d(int, int, int, double **, double *, int *, double *);
+    void ddct2d(int, int, int, double **, double *, int *, double *);
+    void ddst2d(int, int, int, double **, double *, int *, double *);
+    void putdata2d(int n1, int n2, double **a);
+    double errorcheck2d(int n1, int n2, double scale, double **a);
+    int *ip, n1, n2, n, i;
+    double **a, *w, err;
+
+    printf("data length n1=? (n1 = power of 2) \n");
+    scanf("%d", &n1);
+    printf("data length n2=? (n2 = power of 2) \n");
+    scanf("%d", &n2);
+
+    a = alloc_2d_double(n1, n2);
+    n = MAX(n1, n2 / 2);
+    ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
+    n = MAX(n1, n2) * 3 / 2;
+    w = alloc_1d_double(n);
+    ip[0] = 0;
+
+    /* check of CDFT */
+    putdata2d(n1, n2, a);
+    cdft2d(n1, n2, 1, a, NULL, ip, w);
+    cdft2d(n1, n2, -1, a, NULL, ip, w);
+    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+    printf("cdft2d err= %g \n", err);
+
+    /* check of RDFT */
+    putdata2d(n1, n2, a);
+    rdft2d(n1, n2, 1, a, NULL, ip, w);
+    rdft2d(n1, n2, -1, a, NULL, ip, w);
+    err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+    printf("rdft2d err= %g \n", err);
+
+    /* check of DDCT */
+    putdata2d(n1, n2, a);
+    ddct2d(n1, n2, 1, a, NULL, ip, w);
+    ddct2d(n1, n2, -1, a, NULL, ip, w);
+    for (i = 0; i <= n1 - 1; i++) {
+        a[i][0] *= 0.5;
+    }
+    for (i = 0; i <= n2 - 1; i++) {
+        a[0][i] *= 0.5;
+    }
+    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+    printf("ddct2d err= %g \n", err);
+
+    /* check of DDST */
+    putdata2d(n1, n2, a);
+    ddst2d(n1, n2, 1, a, NULL, ip, w);
+    ddst2d(n1, n2, -1, a, NULL, ip, w);
+    for (i = 0; i <= n1 - 1; i++) {
+        a[i][0] *= 0.5;
+    }
+    for (i = 0; i <= n2 - 1; i++) {
+        a[0][i] *= 0.5;
+    }
+    err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+    printf("ddst2d err= %g \n", err);
+
+    free_1d_double(w);
+    free_1d_int(ip);
+    free_2d_double(a);
+    return 0;
+}
+
+
+void putdata2d(int n1, int n2, double **a)
+{
+    int j1, j2, seed = 0;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            a[j1][j2] = RND(&seed);
+        }
+    }
+}
+
+
+double errorcheck2d(int n1, int n2, double scale, double **a)
+{
+    int j1, j2, seed = 0;
+    double err = 0, e;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            e = RND(&seed) - a[j1][j2] * scale;
+            err = MAX(err, fabs(e));
+        }
+    }
+    return err;
+}
+
diff --git a/sample2d/fftsg2dt.f b/sample2d/fftsg2dt.f
new file mode 100644
index 0000000..a375c05
--- /dev/null
+++ b/sample2d/fftsg2dt.f
@@ -0,0 +1,94 @@
+! test of fftsg2d.f
+!
+      program main
+      integer nmax, nmaxsqrt
+      parameter (nmax = 1024)
+      parameter (nmaxsqrt = 32)
+      integer ip(0 : nmaxsqrt + 1), n1, n2, i
+      real*8 a(0 : nmax - 1, 0 : nmax - 1), t(0 : 8 * nmax - 1), 
+     &    w(0 : nmax * 3 / 2 - 1), err, errorcheck2d
+!
+      write (*, *) 'data length n1=? (n1 = power of 2) '
+      read (*, *) n1
+      write (*, *) 'data length n2=? (n2 = power of 2) '
+      read (*, *) n2
+      ip(0) = 0
+!
+!   check of CDFT
+      call putdata2d(nmax, n1, n2, a)
+      call cdft2d(nmax, n1, n2, 1, a, t, ip, w)
+      call cdft2d(nmax, n1, n2, -1, a, t, ip, w)
+      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+      write (*, *) 'cdft2d err= ', err
+!
+!   check of RDFT
+      call putdata2d(nmax, n1, n2, a)
+      call rdft2d(nmax, n1, n2, 1, a, t, ip, w)
+      call rdft2d(nmax, n1, n2, -1, a, t, ip, w)
+      err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+      write (*, *) 'rdft2d err= ', err
+!
+!   check of DDCT
+      call putdata2d(nmax, n1, n2, a)
+      call ddct2d(nmax, n1, n2, 1, a, t, ip, w)
+      call ddct2d(nmax, n1, n2, -1, a, t, ip, w)
+      do i = 0, n1 - 1
+          a(i, 0) = a(i, 0) * 0.5d0
+      end do
+      do i = 0, n2 - 1
+          a(0, i) = a(0, i) * 0.5d0
+      end do
+      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+      write (*, *) 'ddct2d err= ', err
+!
+!   check of DDST
+      call putdata2d(nmax, n1, n2, a)
+      call ddst2d(nmax, n1, n2, 1, a, t, ip, w)
+      call ddst2d(nmax, n1, n2, -1, a, t, ip, w)
+      do i = 0, n1 - 1
+          a(i, 0) = a(i, 0) * 0.5d0
+      end do
+      do i = 0, n2 - 1
+          a(0, i) = a(0, i) * 0.5d0
+      end do
+      err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+      write (*, *) 'ddst2d err= ', err
+!
+      end
+!
+!
+      subroutine putdata2d(n1max, n1, n2, a)
+      integer n1max, n1, n2, j1, j2, seed
+      real*8 a(0 : n1max - 1, 0 : *), drnd
+      seed = 0
+      do j2 = 0, n2 - 1
+          do j1 = 0, n1 - 1
+              a(j1, j2) = drnd(seed)
+          end do
+      end do
+      end
+!
+!
+      function errorcheck2d(n1max, n1, n2, scale, a)
+      integer n1max, n1, n2, j1, j2, seed
+      real*8 scale, a(0 : n1max - 1, 0 : *), drnd, err, e, 
+     &    errorcheck2d
+      err = 0
+      seed = 0
+      do j2 = 0, n2 - 1
+          do j1 = 0, n1 - 1
+              e = drnd(seed) - a(j1, j2) * scale
+              err = max(err, abs(e))
+          end do
+      end do
+      errorcheck2d = err
+      end
+!
+!
+! random number generator, 0 <= drnd < 1
+      real*8 function drnd(seed)
+      integer seed
+      seed = mod(seed * 7141 + 54773, 259200)
+      drnd = seed * (1.0d0 / 259200)
+      end
+!
diff --git a/sample2d/fftsg3dt.c b/sample2d/fftsg3dt.c
new file mode 100644
index 0000000..87879e6
--- /dev/null
+++ b/sample2d/fftsg3dt.c
@@ -0,0 +1,128 @@
+/* test of fftsg3d.c */
+
+#include <math.h>
+#include <stdio.h>
+#include "alloc.h"
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+
+int main()
+{
+    void cdft3d(int, int, int, int, double ***, double *, int *, double *);
+    void rdft3d(int, int, int, int, double ***, double *, int *, double *);
+    void ddct3d(int, int, int, int, double ***, double *, int *, double *);
+    void ddst3d(int, int, int, int, double ***, double *, int *, double *);
+    void putdata3d(int n1, int n2, int n3, double ***a);
+    double errorcheck3d(int n1, int n2, int n3, double scale, double ***a);
+    int *ip, n1, n2, n3, n, nt, i, j;
+    double ***a, *w, err;
+
+    printf("data length n1=? (n1 = power of 2) \n");
+    scanf("%d", &n1);
+    printf("data length n2=? (n2 = power of 2) \n");
+    scanf("%d", &n2);
+    printf("data length n3=? (n3 = power of 2) \n");
+    scanf("%d", &n3);
+
+    a = alloc_3d_double(n1, n2, n3);
+    nt = MAX(n1, n2);
+    n = MAX(nt, n3 / 2);
+    ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
+    n = MAX(nt, n3) * 3 / 2;
+    w = alloc_1d_double(n);
+    ip[0] = 0;
+
+    /* check of CDFT */
+    putdata3d(n1, n2, n3, a);
+    cdft3d(n1, n2, n3, 1, a, NULL, ip, w);
+    cdft3d(n1, n2, n3, -1, a, NULL, ip, w);
+    err = errorcheck3d(n1, n2, n3, 2.0 / n1 / n2 / n3, a);
+    printf("cdft3d err= %g \n", err);
+
+    /* check of RDFT */
+    putdata3d(n1, n2, n3, a);
+    rdft3d(n1, n2, n3, 1, a, NULL, ip, w);
+    rdft3d(n1, n2, n3, -1, a, NULL, ip, w);
+    err = errorcheck3d(n1, n2, n3, 2.0 / n1 / n2 / n3, a);
+    printf("rdft3d err= %g \n", err);
+
+    /* check of DDCT */
+    putdata3d(n1, n2, n3, a);
+    ddct3d(n1, n2, n3, 1, a, NULL, ip, w);
+    ddct3d(n1, n2, n3, -1, a, NULL, ip, w);
+    for (i = 0; i <= n1 - 1; i++) {
+        for (j = 0; j <= n2 - 1; j++) {
+            a[i][j][0] *= 0.5;
+        }
+        for (j = 0; j <= n3 - 1; j++) {
+            a[i][0][j] *= 0.5;
+        }
+    }
+    for (i = 0; i <= n2 - 1; i++) {
+        for (j = 0; j <= n3 - 1; j++) {
+            a[0][i][j] *= 0.5;
+        }
+    }
+    err = errorcheck3d(n1, n2, n3, 8.0 / n1 / n2 / n3, a);
+    printf("ddct3d err= %g \n", err);
+
+    /* check of DDST */
+    putdata3d(n1, n2, n3, a);
+    ddst3d(n1, n2, n3, 1, a, NULL, ip, w);
+    ddst3d(n1, n2, n3, -1, a, NULL, ip, w);
+    for (i = 0; i <= n1 - 1; i++) {
+        for (j = 0; j <= n2 - 1; j++) {
+            a[i][j][0] *= 0.5;
+        }
+        for (j = 0; j <= n3 - 1; j++) {
+            a[i][0][j] *= 0.5;
+        }
+    }
+    for (i = 0; i <= n2 - 1; i++) {
+        for (j = 0; j <= n3 - 1; j++) {
+            a[0][i][j] *= 0.5;
+        }
+    }
+    err = errorcheck3d(n1, n2, n3, 8.0 / n1 / n2 / n3, a);
+    printf("ddst3d err= %g \n", err);
+
+    free_1d_double(w);
+    free_1d_int(ip);
+    free_3d_double(a);
+    return 0;
+}
+
+
+void putdata3d(int n1, int n2, int n3, double ***a)
+{
+    int j1, j2, j3, seed = 0;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            for (j3 = 0; j3 <= n3 - 1; j3++) {
+                a[j1][j2][j3] = RND(&seed);
+            }
+        }
+    }
+}
+
+
+double errorcheck3d(int n1, int n2, int n3, double scale, double ***a)
+{
+    int j1, j2, j3, seed = 0;
+    double err = 0, e;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            for (j3 = 0; j3 <= n3 - 1; j3++) {
+                e = RND(&seed) - a[j1][j2][j3] * scale;
+                err = MAX(err, fabs(e));
+            }
+        }
+    }
+    return err;
+}
+
diff --git a/sample2d/fftsg3dt.f b/sample2d/fftsg3dt.f
new file mode 100644
index 0000000..bbada0d
--- /dev/null
+++ b/sample2d/fftsg3dt.f
@@ -0,0 +1,119 @@
+! test of fftsg3d.f
+!
+      program main
+      integer nmax, nmaxsqrt
+      parameter (nmax = 128)
+      parameter (nmaxsqrt = 16)
+      integer ip(0 : nmaxsqrt + 1), n1, n2, n3, i, j
+      real*8 a(0 : nmax - 1, 0 : nmax - 1, 0 : nmax - 1), 
+     &    t(0 : 8 * nmax - 1), 
+     &    w(0 : nmax * 3 / 2 - 1), err, errorcheck3d
+!
+      write (*, *) 'data length n1=? (n1 = power of 2) '
+      read (*, *) n1
+      write (*, *) 'data length n2=? (n2 = power of 2) '
+      read (*, *) n2
+      write (*, *) 'data length n3=? (n3 = power of 2) '
+      read (*, *) n3
+      ip(0) = 0
+!
+!   check of CDFT
+      call putdata3d(nmax, nmax, n1, n2, n3, a)
+      call cdft3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+      call cdft3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
+     &    2.0d0 / n1 / n2 / n3, a)
+      write (*, *) 'cdft3d err= ', err
+!
+!   check of RDFT
+      call putdata3d(nmax, nmax, n1, n2, n3, a)
+      call rdft3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+      call rdft3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
+     &    2.0d0 / n1 / n2 / n3, a)
+      write (*, *) 'rdft3d err= ', err
+!
+!   check of DDCT
+      call putdata3d(nmax, nmax, n1, n2, n3, a)
+      call ddct3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+      call ddct3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+      do j = 0, n2 - 1
+          do i = 0, n1 - 1
+              a(i, j, 0) = a(i, j, 0) * 0.5d0
+          end do
+      end do
+      do j = 0, n3 - 1
+          do i = 0, n1 - 1
+              a(i, 0, j) = a(i, 0, j) * 0.5d0
+          end do
+          do i = 0, n2 - 1
+              a(0, i, j) = a(0, i, j) * 0.5d0
+          end do
+      end do
+      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
+     &    8.0d0 / n1 / n2 / n3, a)
+      write (*, *) 'ddct3d err= ', err
+!
+!   check of DDST
+      call putdata3d(nmax, nmax, n1, n2, n3, a)
+      call ddst3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+      call ddst3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+      do j = 0, n2 - 1
+          do i = 0, n1 - 1
+              a(i, j, 0) = a(i, j, 0) * 0.5d0
+          end do
+      end do
+      do j = 0, n3 - 1
+          do i = 0, n1 - 1
+              a(i, 0, j) = a(i, 0, j) * 0.5d0
+          end do
+          do i = 0, n2 - 1
+              a(0, i, j) = a(0, i, j) * 0.5d0
+          end do
+      end do
+      err = errorcheck3d(nmax, nmax, n1, n2, n3, 
+     &    8.0d0 / n1 / n2 / n3, a)
+      write (*, *) 'ddst3d err= ', err
+!
+      end
+!
+!
+      subroutine putdata3d(n1max, n2max, n1, n2, n3, a)
+      integer n1max, n2max, n1, n2, n3, j1, j2, j3, seed
+      real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : *), drnd
+      seed = 0
+      do j3 = 0, n3 - 1
+          do j2 = 0, n2 - 1
+              do j1 = 0, n1 - 1
+                  a(j1, j2, j3) = drnd(seed)
+              end do
+          end do
+      end do
+      end
+!
+!
+      function errorcheck3d(n1max, n2max, n1, n2, n3, scale, a)
+      integer n1max, n2max, n1, n2, n3, j1, j2, j3, seed
+      real*8 scale, a(0 : n1max - 1, 0 : n2max - 1, 0 : *), 
+     &    drnd, err, e, errorcheck3d
+      err = 0
+      seed = 0
+      do j3 = 0, n3 - 1
+          do j2 = 0, n2 - 1
+              do j1 = 0, n1 - 1
+                  e = drnd(seed) - a(j1, j2, j3) * scale
+                  err = max(err, abs(e))
+              end do
+          end do
+      end do
+      errorcheck3d = err
+      end
+!
+!
+! random number generator, 0 <= drnd < 1
+      real*8 function drnd(seed)
+      integer seed
+      seed = mod(seed * 7141 + 54773, 259200)
+      drnd = seed * (1.0d0 / 259200)
+      end
+!
diff --git a/sample2d/shrtdctt.c b/sample2d/shrtdctt.c
new file mode 100644
index 0000000..70d665f
--- /dev/null
+++ b/sample2d/shrtdctt.c
@@ -0,0 +1,68 @@
+/* test of shrtdct.c */
+
+#include <math.h>
+#include <stdio.h>
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+#define NMAX 16
+
+int main()
+{
+    void ddct8x8s(int isgn, double **a);
+    void ddct16x16s(int isgn, double **a);
+    void putdata2d(int n1, int n2, double **a);
+    double errorcheck2d(int n1, int n2, double scale, double **a);
+    double err;
+    
+    int i;
+    double aarr[NMAX][NMAX], *a[NMAX], barr[NMAX][NMAX], *b[NMAX];
+    for (i = 0; i < NMAX; i++) a[i] = aarr[i];
+    for (i = 0; i < NMAX; i++) b[i] = barr[i];
+
+    /* check of 8x8 DCT */
+    putdata2d(8, 8, a);
+    ddct8x8s(-1, a);
+    ddct8x8s(1, a);
+    err = errorcheck2d(8, 8, 1.0, a);
+    printf("ddct8x8s   err= %g\n", err);
+
+    /* check of 16x16 DCT */
+    putdata2d(16, 16, a);
+    ddct16x16s(-1, a);
+    ddct16x16s(1, a);
+    err = errorcheck2d(16, 16, 1.0, a);
+    printf("ddct16x16s err= %g\n", err);
+
+    return 0;
+}
+
+
+void putdata2d(int n1, int n2, double **a)
+{
+    int j1, j2, seed = 0;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            a[j1][j2] = RND(&seed);
+        }
+    }
+}
+
+
+double errorcheck2d(int n1, int n2, double scale, double **a)
+{
+    int j1, j2, seed = 0;
+    double err = 0, e;
+
+    for (j1 = 0; j1 <= n1 - 1; j1++) {
+        for (j2 = 0; j2 <= n2 - 1; j2++) {
+            e = RND(&seed) - a[j1][j2] * scale;
+            err = MAX(err, fabs(e));
+        }
+    }
+    return err;
+}
+
diff --git a/shrtdct.c b/shrtdct.c
new file mode 100644
index 0000000..455cb4c
--- /dev/null
+++ b/shrtdct.c
@@ -0,0 +1,538 @@
+/*
+Short Discrete Cosine Transform
+    data length :8x8, 16x16
+    method      :row-column, radix 4 FFT
+functions
+    ddct8x8s  : 8x8 DCT
+    ddct16x16s: 16x16 DCT
+function prototypes
+    void ddct8x8s(int isgn, double **a);
+    void ddct16x16s(int isgn, double **a);
+*/
+
+
+/*
+-------- 8x8 DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> Normalized 8x8 IDCT
+            C[k1][k2] = (1/4) * sum_j1=0^7 sum_j2=0^7 
+                            a[j1][j2] * s[j1] * s[j2] * 
+                            cos(pi*j1*(k1+1/2)/8) * 
+                            cos(pi*j2*(k2+1/2)/8), 0<=k1<8, 0<=k2<8
+                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+        <case2> Normalized 8x8 DCT
+            C[k1][k2] = (1/4) * s[k1] * s[k2] * sum_j1=0^7 sum_j2=0^7 
+                            a[j1][j2] * 
+                            cos(pi*(j1+1/2)*k1/8) * 
+                            cos(pi*(j2+1/2)*k2/8), 0<=k1<8, 0<=k2<8
+                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+    [usage]
+        <case1>
+            ddct8x8s(1, a);
+        <case2>
+            ddct8x8s(-1, a);
+    [parameters]
+        a[0...7][0...7] :input/output data (double **)
+                         output data
+                             a[k1][k2] = C[k1][k2], 0<=k1<8, 0<=k2<8
+*/
+
+
+/* Cn_kR = sqrt(2.0/n) * cos(pi/2*k/n) */
+/* Cn_kI = sqrt(2.0/n) * sin(pi/2*k/n) */
+/* Wn_kR = cos(pi/2*k/n) */
+/* Wn_kI = sin(pi/2*k/n) */
+#define C8_1R   0.49039264020161522456
+#define C8_1I   0.09754516100806413392
+#define C8_2R   0.46193976625564337806
+#define C8_2I   0.19134171618254488586
+#define C8_3R   0.41573480615127261854
+#define C8_3I   0.27778511650980111237
+#define C8_4R   0.35355339059327376220
+#define W8_4R   0.70710678118654752440
+
+
+void ddct8x8s(int isgn, double **a)
+{
+    int j;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    double xr, xi;
+    
+    if (isgn < 0) {
+        for (j = 0; j <= 7; j++) {
+            x0r = a[0][j] + a[7][j];
+            x1r = a[0][j] - a[7][j];
+            x0i = a[2][j] + a[5][j];
+            x1i = a[2][j] - a[5][j];
+            x2r = a[4][j] + a[3][j];
+            x3r = a[4][j] - a[3][j];
+            x2i = a[6][j] + a[1][j];
+            x3i = a[6][j] - a[1][j];
+            xr = x0r + x2r;
+            xi = x0i + x2i;
+            a[0][j] = C8_4R * (xr + xi);
+            a[4][j] = C8_4R * (xr - xi);
+            xr = x0r - x2r;
+            xi = x0i - x2i;
+            a[2][j] = C8_2R * xr - C8_2I * xi;
+            a[6][j] = C8_2R * xi + C8_2I * xr;
+            xr = W8_4R * (x1i - x3i);
+            x1i = W8_4R * (x1i + x3i);
+            x3i = x1i - x3r;
+            x1i += x3r;
+            x3r = x1r - xr;
+            x1r += xr;
+            a[1][j] = C8_1R * x1r - C8_1I * x1i;
+            a[7][j] = C8_1R * x1i + C8_1I * x1r;
+            a[3][j] = C8_3R * x3r - C8_3I * x3i;
+            a[5][j] = C8_3R * x3i + C8_3I * x3r;
+        }
+        for (j = 0; j <= 7; j++) {
+            x0r = a[j][0] + a[j][7];
+            x1r = a[j][0] - a[j][7];
+            x0i = a[j][2] + a[j][5];
+            x1i = a[j][2] - a[j][5];
+            x2r = a[j][4] + a[j][3];
+            x3r = a[j][4] - a[j][3];
+            x2i = a[j][6] + a[j][1];
+            x3i = a[j][6] - a[j][1];
+            xr = x0r + x2r;
+            xi = x0i + x2i;
+            a[j][0] = C8_4R * (xr + xi);
+            a[j][4] = C8_4R * (xr - xi);
+            xr = x0r - x2r;
+            xi = x0i - x2i;
+            a[j][2] = C8_2R * xr - C8_2I * xi;
+            a[j][6] = C8_2R * xi + C8_2I * xr;
+            xr = W8_4R * (x1i - x3i);
+            x1i = W8_4R * (x1i + x3i);
+            x3i = x1i - x3r;
+            x1i += x3r;
+            x3r = x1r - xr;
+            x1r += xr;
+            a[j][1] = C8_1R * x1r - C8_1I * x1i;
+            a[j][7] = C8_1R * x1i + C8_1I * x1r;
+            a[j][3] = C8_3R * x3r - C8_3I * x3i;
+            a[j][5] = C8_3R * x3i + C8_3I * x3r;
+        }
+    } else {
+        for (j = 0; j <= 7; j++) {
+            x1r = C8_1R * a[1][j] + C8_1I * a[7][j];
+            x1i = C8_1R * a[7][j] - C8_1I * a[1][j];
+            x3r = C8_3R * a[3][j] + C8_3I * a[5][j];
+            x3i = C8_3R * a[5][j] - C8_3I * a[3][j];
+            xr = x1r - x3r;
+            xi = x1i + x3i;
+            x1r += x3r;
+            x3i -= x1i;
+            x1i = W8_4R * (xr + xi);
+            x3r = W8_4R * (xr - xi);
+            xr = C8_2R * a[2][j] + C8_2I * a[6][j];
+            xi = C8_2R * a[6][j] - C8_2I * a[2][j];
+            x0r = C8_4R * (a[0][j] + a[4][j]);
+            x0i = C8_4R * (a[0][j] - a[4][j]);
+            x2r = x0r - xr;
+            x2i = x0i - xi;
+            x0r += xr;
+            x0i += xi;
+            a[0][j] = x0r + x1r;
+            a[7][j] = x0r - x1r;
+            a[2][j] = x0i + x1i;
+            a[5][j] = x0i - x1i;
+            a[4][j] = x2r - x3i;
+            a[3][j] = x2r + x3i;
+            a[6][j] = x2i - x3r;
+            a[1][j] = x2i + x3r;
+        }
+        for (j = 0; j <= 7; j++) {
+            x1r = C8_1R * a[j][1] + C8_1I * a[j][7];
+            x1i = C8_1R * a[j][7] - C8_1I * a[j][1];
+            x3r = C8_3R * a[j][3] + C8_3I * a[j][5];
+            x3i = C8_3R * a[j][5] - C8_3I * a[j][3];
+            xr = x1r - x3r;
+            xi = x1i + x3i;
+            x1r += x3r;
+            x3i -= x1i;
+            x1i = W8_4R * (xr + xi);
+            x3r = W8_4R * (xr - xi);
+            xr = C8_2R * a[j][2] + C8_2I * a[j][6];
+            xi = C8_2R * a[j][6] - C8_2I * a[j][2];
+            x0r = C8_4R * (a[j][0] + a[j][4]);
+            x0i = C8_4R * (a[j][0] - a[j][4]);
+            x2r = x0r - xr;
+            x2i = x0i - xi;
+            x0r += xr;
+            x0i += xi;
+            a[j][0] = x0r + x1r;
+            a[j][7] = x0r - x1r;
+            a[j][2] = x0i + x1i;
+            a[j][5] = x0i - x1i;
+            a[j][4] = x2r - x3i;
+            a[j][3] = x2r + x3i;
+            a[j][6] = x2i - x3r;
+            a[j][1] = x2i + x3r;
+        }
+    }
+}
+
+
+
+/*
+-------- 16x16 DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> Normalized 16x16 IDCT
+            C[k1][k2] = (1/8) * sum_j1=0^15 sum_j2=0^15 
+                            a[j1][j2] * s[j1] * s[j2] * 
+                            cos(pi*j1*(k1+1/2)/16) * 
+                            cos(pi*j2*(k2+1/2)/16), 0<=k1<16, 0<=k2<16
+                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+        <case2> Normalized 16x16 DCT
+            C[k1][k2] = (1/8) * s[k1] * s[k2] * sum_j1=0^15 sum_j2=0^15 
+                            a[j1][j2] * 
+                            cos(pi*(j1+1/2)*k1/16) * 
+                            cos(pi*(j2+1/2)*k2/16), 0<=k1<16, 0<=k2<16
+                            (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+    [usage]
+        <case1>
+            ddct16x16s(1, a);
+        <case2>
+            ddct16x16s(-1, a);
+    [parameters]
+        a[0...15][0...15] :input/output data (double **)
+                           output data
+                               a[k1][k2] = C[k1][k2], 0<=k1<16, 0<=k2<16
+*/
+
+
+/* Cn_kR = sqrt(2.0/n) * cos(pi/2*k/n) */
+/* Cn_kI = sqrt(2.0/n) * sin(pi/2*k/n) */
+/* Wn_kR = cos(pi/2*k/n) */
+/* Wn_kI = sin(pi/2*k/n) */
+#define C16_1R   0.35185093438159561476
+#define C16_1I   0.03465429229977286565
+#define C16_2R   0.34675996133053686546
+#define C16_2I   0.06897484482073575308
+#define C16_3R   0.33832950029358816957
+#define C16_3I   0.10263113188058934529
+#define C16_4R   0.32664074121909413196
+#define C16_4I   0.13529902503654924610
+#define C16_5R   0.31180625324666780814
+#define C16_5I   0.16666391461943662432
+#define C16_6R   0.29396890060483967924
+#define C16_6I   0.19642373959677554532
+#define C16_7R   0.27330046675043937206
+#define C16_7I   0.22429189658565907106
+#define C16_8R   0.25
+#define W16_4R   0.92387953251128675613
+#define W16_4I   0.38268343236508977173
+#define W16_8R   0.70710678118654752440
+
+
+void ddct16x16s(int isgn, double **a)
+{
+    int j;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    double x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i;
+    double xr, xi;
+    
+    if (isgn < 0) {
+        for (j = 0; j <= 15; j++) {
+            x4r = a[0][j] - a[15][j];
+            xr = a[0][j] + a[15][j];
+            x4i = a[8][j] - a[7][j];
+            xi = a[8][j] + a[7][j];
+            x0r = xr + xi;
+            x0i = xr - xi;
+            x5r = a[2][j] - a[13][j];
+            xr = a[2][j] + a[13][j];
+            x5i = a[10][j] - a[5][j];
+            xi = a[10][j] + a[5][j];
+            x1r = xr + xi;
+            x1i = xr - xi;
+            x6r = a[4][j] - a[11][j];
+            xr = a[4][j] + a[11][j];
+            x6i = a[12][j] - a[3][j];
+            xi = a[12][j] + a[3][j];
+            x2r = xr + xi;
+            x2i = xr - xi;
+            x7r = a[6][j] - a[9][j];
+            xr = a[6][j] + a[9][j];
+            x7i = a[14][j] - a[1][j];
+            xi = a[14][j] + a[1][j];
+            x3r = xr + xi;
+            x3i = xr - xi;
+            xr = x0r + x2r;
+            xi = x1r + x3r;
+            a[0][j] = C16_8R * (xr + xi);
+            a[8][j] = C16_8R * (xr - xi);
+            xr = x0r - x2r;
+            xi = x1r - x3r;
+            a[4][j] = C16_4R * xr - C16_4I * xi;
+            a[12][j] = C16_4R * xi + C16_4I * xr;
+            x0r = W16_8R * (x1i - x3i);
+            x2r = W16_8R * (x1i + x3i);
+            xr = x0i + x0r;
+            xi = x2r + x2i;
+            a[2][j] = C16_2R * xr - C16_2I * xi;
+            a[14][j] = C16_2R * xi + C16_2I * xr;
+            xr = x0i - x0r;
+            xi = x2r - x2i;
+            a[6][j] = C16_6R * xr - C16_6I * xi;
+            a[10][j] = C16_6R * xi + C16_6I * xr;
+            xr = W16_8R * (x6r - x6i);
+            xi = W16_8R * (x6i + x6r);
+            x6r = x4r - xr;
+            x6i = x4i - xi;
+            x4r += xr;
+            x4i += xi;
+            xr = W16_4I * x7r - W16_4R * x7i;
+            xi = W16_4I * x7i + W16_4R * x7r;
+            x7r = W16_4R * x5r - W16_4I * x5i;
+            x7i = W16_4R * x5i + W16_4I * x5r;
+            x5r = x7r + xr;
+            x5i = x7i + xi;
+            x7r -= xr;
+            x7i -= xi;
+            xr = x4r + x5r;
+            xi = x5i + x4i;
+            a[1][j] = C16_1R * xr - C16_1I * xi;
+            a[15][j] = C16_1R * xi + C16_1I * xr;
+            xr = x4r - x5r;
+            xi = x5i - x4i;
+            a[7][j] = C16_7R * xr - C16_7I * xi;
+            a[9][j] = C16_7R * xi + C16_7I * xr;
+            xr = x6r - x7i;
+            xi = x7r + x6i;
+            a[5][j] = C16_5R * xr - C16_5I * xi;
+            a[11][j] = C16_5R * xi + C16_5I * xr;
+            xr = x6r + x7i;
+            xi = x7r - x6i;
+            a[3][j] = C16_3R * xr - C16_3I * xi;
+            a[13][j] = C16_3R * xi + C16_3I * xr;
+        }
+        for (j = 0; j <= 15; j++) {
+            x4r = a[j][0] - a[j][15];
+            xr = a[j][0] + a[j][15];
+            x4i = a[j][8] - a[j][7];
+            xi = a[j][8] + a[j][7];
+            x0r = xr + xi;
+            x0i = xr - xi;
+            x5r = a[j][2] - a[j][13];
+            xr = a[j][2] + a[j][13];
+            x5i = a[j][10] - a[j][5];
+            xi = a[j][10] + a[j][5];
+            x1r = xr + xi;
+            x1i = xr - xi;
+            x6r = a[j][4] - a[j][11];
+            xr = a[j][4] + a[j][11];
+            x6i = a[j][12] - a[j][3];
+            xi = a[j][12] + a[j][3];
+            x2r = xr + xi;
+            x2i = xr - xi;
+            x7r = a[j][6] - a[j][9];
+            xr = a[j][6] + a[j][9];
+            x7i = a[j][14] - a[j][1];
+            xi = a[j][14] + a[j][1];
+            x3r = xr + xi;
+            x3i = xr - xi;
+            xr = x0r + x2r;
+            xi = x1r + x3r;
+            a[j][0] = C16_8R * (xr + xi);
+            a[j][8] = C16_8R * (xr - xi);
+            xr = x0r - x2r;
+            xi = x1r - x3r;
+            a[j][4] = C16_4R * xr - C16_4I * xi;
+            a[j][12] = C16_4R * xi + C16_4I * xr;
+            x0r = W16_8R * (x1i - x3i);
+            x2r = W16_8R * (x1i + x3i);
+            xr = x0i + x0r;
+            xi = x2r + x2i;
+            a[j][2] = C16_2R * xr - C16_2I * xi;
+            a[j][14] = C16_2R * xi + C16_2I * xr;
+            xr = x0i - x0r;
+            xi = x2r - x2i;
+            a[j][6] = C16_6R * xr - C16_6I * xi;
+            a[j][10] = C16_6R * xi + C16_6I * xr;
+            xr = W16_8R * (x6r - x6i);
+            xi = W16_8R * (x6i + x6r);
+            x6r = x4r - xr;
+            x6i = x4i - xi;
+            x4r += xr;
+            x4i += xi;
+            xr = W16_4I * x7r - W16_4R * x7i;
+            xi = W16_4I * x7i + W16_4R * x7r;
+            x7r = W16_4R * x5r - W16_4I * x5i;
+            x7i = W16_4R * x5i + W16_4I * x5r;
+            x5r = x7r + xr;
+            x5i = x7i + xi;
+            x7r -= xr;
+            x7i -= xi;
+            xr = x4r + x5r;
+            xi = x5i + x4i;
+            a[j][1] = C16_1R * xr - C16_1I * xi;
+            a[j][15] = C16_1R * xi + C16_1I * xr;
+            xr = x4r - x5r;
+            xi = x5i - x4i;
+            a[j][7] = C16_7R * xr - C16_7I * xi;
+            a[j][9] = C16_7R * xi + C16_7I * xr;
+            xr = x6r - x7i;
+            xi = x7r + x6i;
+            a[j][5] = C16_5R * xr - C16_5I * xi;
+            a[j][11] = C16_5R * xi + C16_5I * xr;
+            xr = x6r + x7i;
+            xi = x7r - x6i;
+            a[j][3] = C16_3R * xr - C16_3I * xi;
+            a[j][13] = C16_3R * xi + C16_3I * xr;
+        }
+    } else {
+        for (j = 0; j <= 15; j++) {
+            x5r = C16_1R * a[1][j] + C16_1I * a[15][j];
+            x5i = C16_1R * a[15][j] - C16_1I * a[1][j];
+            xr = C16_7R * a[7][j] + C16_7I * a[9][j];
+            xi = C16_7R * a[9][j] - C16_7I * a[7][j];
+            x4r = x5r + xr;
+            x4i = x5i - xi;
+            x5r -= xr;
+            x5i += xi;
+            x7r = C16_5R * a[5][j] + C16_5I * a[11][j];
+            x7i = C16_5R * a[11][j] - C16_5I * a[5][j];
+            xr = C16_3R * a[3][j] + C16_3I * a[13][j];
+            xi = C16_3R * a[13][j] - C16_3I * a[3][j];
+            x6r = x7r + xr;
+            x6i = x7i - xi;
+            x7r -= xr;
+            x7i += xi;
+            xr = x4r - x6r;
+            xi = x4i - x6i;
+            x4r += x6r;
+            x4i += x6i;
+            x6r = W16_8R * (xi + xr);
+            x6i = W16_8R * (xi - xr);
+            xr = x5r + x7i;
+            xi = x5i - x7r;
+            x5r -= x7i;
+            x5i += x7r;
+            x7r = W16_4I * x5r + W16_4R * x5i;
+            x7i = W16_4I * x5i - W16_4R * x5r;
+            x5r = W16_4R * xr + W16_4I * xi;
+            x5i = W16_4R * xi - W16_4I * xr;
+            xr = C16_4R * a[4][j] + C16_4I * a[12][j];
+            xi = C16_4R * a[12][j] - C16_4I * a[4][j];
+            x2r = C16_8R * (a[0][j] + a[8][j]);
+            x3r = C16_8R * (a[0][j] - a[8][j]);
+            x0r = x2r + xr;
+            x1r = x3r + xi;
+            x2r -= xr;
+            x3r -= xi;
+            x0i = C16_2R * a[2][j] + C16_2I * a[14][j];
+            x2i = C16_2R * a[14][j] - C16_2I * a[2][j];
+            x1i = C16_6R * a[6][j] + C16_6I * a[10][j];
+            x3i = C16_6R * a[10][j] - C16_6I * a[6][j];
+            xr = x0i - x1i;
+            xi = x2i + x3i;
+            x0i += x1i;
+            x2i -= x3i;
+            x1i = W16_8R * (xi + xr);
+            x3i = W16_8R * (xi - xr);
+            xr = x0r + x0i;
+            xi = x0r - x0i;
+            a[0][j] = xr + x4r;
+            a[15][j] = xr - x4r;
+            a[8][j] = xi + x4i;
+            a[7][j] = xi - x4i;
+            xr = x1r + x1i;
+            xi = x1r - x1i;
+            a[2][j] = xr + x5r;
+            a[13][j] = xr - x5r;
+            a[10][j] = xi + x5i;
+            a[5][j] = xi - x5i;
+            xr = x2r + x2i;
+            xi = x2r - x2i;
+            a[4][j] = xr + x6r;
+            a[11][j] = xr - x6r;
+            a[12][j] = xi + x6i;
+            a[3][j] = xi - x6i;
+            xr = x3r + x3i;
+            xi = x3r - x3i;
+            a[6][j] = xr + x7r;
+            a[9][j] = xr - x7r;
+            a[14][j] = xi + x7i;
+            a[1][j] = xi - x7i;
+        }
+        for (j = 0; j <= 15; j++) {
+            x5r = C16_1R * a[j][1] + C16_1I * a[j][15];
+            x5i = C16_1R * a[j][15] - C16_1I * a[j][1];
+            xr = C16_7R * a[j][7] + C16_7I * a[j][9];
+            xi = C16_7R * a[j][9] - C16_7I * a[j][7];
+            x4r = x5r + xr;
+            x4i = x5i - xi;
+            x5r -= xr;
+            x5i += xi;
+            x7r = C16_5R * a[j][5] + C16_5I * a[j][11];
+            x7i = C16_5R * a[j][11] - C16_5I * a[j][5];
+            xr = C16_3R * a[j][3] + C16_3I * a[j][13];
+            xi = C16_3R * a[j][13] - C16_3I * a[j][3];
+            x6r = x7r + xr;
+            x6i = x7i - xi;
+            x7r -= xr;
+            x7i += xi;
+            xr = x4r - x6r;
+            xi = x4i - x6i;
+            x4r += x6r;
+            x4i += x6i;
+            x6r = W16_8R * (xi + xr);
+            x6i = W16_8R * (xi - xr);
+            xr = x5r + x7i;
+            xi = x5i - x7r;
+            x5r -= x7i;
+            x5i += x7r;
+            x7r = W16_4I * x5r + W16_4R * x5i;
+            x7i = W16_4I * x5i - W16_4R * x5r;
+            x5r = W16_4R * xr + W16_4I * xi;
+            x5i = W16_4R * xi - W16_4I * xr;
+            xr = C16_4R * a[j][4] + C16_4I * a[j][12];
+            xi = C16_4R * a[j][12] - C16_4I * a[j][4];
+            x2r = C16_8R * (a[j][0] + a[j][8]);
+            x3r = C16_8R * (a[j][0] - a[j][8]);
+            x0r = x2r + xr;
+            x1r = x3r + xi;
+            x2r -= xr;
+            x3r -= xi;
+            x0i = C16_2R * a[j][2] + C16_2I * a[j][14];
+            x2i = C16_2R * a[j][14] - C16_2I * a[j][2];
+            x1i = C16_6R * a[j][6] + C16_6I * a[j][10];
+            x3i = C16_6R * a[j][10] - C16_6I * a[j][6];
+            xr = x0i - x1i;
+            xi = x2i + x3i;
+            x0i += x1i;
+            x2i -= x3i;
+            x1i = W16_8R * (xi + xr);
+            x3i = W16_8R * (xi - xr);
+            xr = x0r + x0i;
+            xi = x0r - x0i;
+            a[j][0] = xr + x4r;
+            a[j][15] = xr - x4r;
+            a[j][8] = xi + x4i;
+            a[j][7] = xi - x4i;
+            xr = x1r + x1i;
+            xi = x1r - x1i;
+            a[j][2] = xr + x5r;
+            a[j][13] = xr - x5r;
+            a[j][10] = xi + x5i;
+            a[j][5] = xi - x5i;
+            xr = x2r + x2i;
+            xi = x2r - x2i;
+            a[j][4] = xr + x6r;
+            a[j][11] = xr - x6r;
+            a[j][12] = xi + x6i;
+            a[j][3] = xi - x6i;
+            xr = x3r + x3i;
+            xi = x3r - x3i;
+            a[j][6] = xr + x7r;
+            a[j][9] = xr - x7r;
+            a[j][14] = xi + x7i;
+            a[j][1] = xi - x7i;
+        }
+    }
+}
+