/*
 * Copyright (c) 1997-1999, 2003 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include "fftw_threads-int.h"
#include "rfftw_threads.h"

/****************** prototypes for rexec2_threads routines *****************/

extern void rfftw_real2c_threads_aux(fftw_plan plan, int howmany,
				     fftw_real *in, int istride, int idist,
				     fftw_complex *out, int ostride, int odist,
				     fftw_real *work,
				     int nthreads);
extern void rfftw_c2real_threads_aux(fftw_plan plan, int howmany,
				     fftw_complex *in, int istride, int idist,
				     fftw_real *out, int ostride, int odist,
				     fftw_real *work,
				     int nthreads);
extern void rfftw_real2c_overlap_threads_aux(fftw_plan plan, int howmany,
					     fftw_real *in, int istride,
					     int idist,
					     fftw_complex *out,
					     int ostride, int odist,
					     fftw_real *work,
					     int nthreads);
extern void rfftw_c2real_overlap_threads_aux(fftw_plan plan, int howmany,
					     fftw_complex *in,
					     int istride, int idist,
					     fftw_real *out, 
					     int ostride, int odist,
					     fftw_real *work,
					     int nthreads);

/********************** prototypes for rexec2 routines **********************/

extern void rfftw_real2c_aux(fftw_plan plan, int howmany,
			     fftw_real *in, int istride, int idist,
			     fftw_complex *out, int ostride, int odist,
			     fftw_real *work);
extern void rfftw_c2real_aux(fftw_plan plan, int howmany,
			     fftw_complex *in, int istride, int idist,
			     fftw_real *out, int ostride, int odist,
			     fftw_real *work);
extern void rfftw_real2c_overlap_aux(fftw_plan plan, int howmany,
				   fftw_real *in, int istride, int idist,
			       fftw_complex *out, int ostride, int odist,
				     fftw_real *work);
extern void rfftw_c2real_overlap_aux(fftw_plan plan, int howmany,
				fftw_complex *in, int istride, int idist,
				  fftw_real *out, int ostride, int odist,
				     fftw_real *work);

/****************** prototypes for rfftwnd routines *****************/

extern void rfftwnd_real2c_aux(fftwnd_plan p, int cur_dim,
			       fftw_real *in, int istride,
			       fftw_complex *out, int ostride,
			       fftw_real *work);
extern void rfftwnd_c2real_aux(fftwnd_plan p, int cur_dim,
			       fftw_complex *in, int istride,
			       fftw_real *out, int ostride,
			       fftw_real *work);
extern void rfftwnd_real2c_aux_howmany(fftwnd_plan p, int cur_dim,
                                int howmany,
                                fftw_real *in, int istride, int idist,
                                fftw_complex *out, int ostride, int odist,
                                fftw_complex *work);
extern void rfftwnd_c2real_aux_howmany(fftwnd_plan p, int cur_dim,
                                int howmany,
                                fftw_complex *in, int istride, int idist,
                                fftw_real *out, int ostride, int odist,
                                fftw_complex *work);


/*********** Computing the N-Dimensional FFT: Auxiliary Routines ************/

typedef struct {
     fftwnd_plan p;
     int cur_dim;
     void *in;
     int istride, idist;
     void *out;
     int ostride, odist;
     fftw_complex *work;
} aux_data;

static void *real2c_aux_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     aux_data *d = (aux_data *) ldata->data;
     fftwnd_plan p = d->p;
     int cur_dim = d->cur_dim;
     fftw_real *in = (fftw_real *) d->in;
     int istride = d->istride, idist = d->idist;
     fftw_complex *out = (fftw_complex *) d->out;
     int ostride = d->ostride, odist = d->odist;
     fftw_real *work = (fftw_real*) (d->work + p->nwork * ldata->thread_num);

     for (; min < max; ++min)
	  rfftwnd_real2c_aux(p, cur_dim, in + idist * min, istride,
			     out + odist * min, ostride, work);
     return 0;
}

void rfftwnd_real2c_threads_aux(fftwnd_plan p, int cur_dim,
				fftw_real *in, int istride,
				fftw_complex *out, int ostride,
				fftw_complex *work,
				int nthreads)
{
     int n_after = p->n_after[cur_dim], n = p->n[cur_dim];

     if (cur_dim == p->rank - 2) {
	  /* just do the last dimension directly: */
	  if (p->is_in_place)
	       rfftw_real2c_threads_aux(p->plans[p->rank - 1], n,
					in, istride, (n_after * istride) * 2,
					out, istride, n_after * istride,
					(fftw_real *) work, nthreads);
	  else
	       rfftw_real2c_threads_aux(p->plans[p->rank - 1], n,
					in, istride,
					p->plans[p->rank - 1]->n * istride,
					out, ostride, n_after * ostride,
					(fftw_real *) work, nthreads);
     }
     else {    /* we have at least two dimensions to go */
	  int nr = p->plans[p->rank - 1]->n;
	  aux_data d;

	  d.p = p;
	  d.cur_dim = cur_dim + 1;
	  d.in = in;
	  d.istride = istride;
	  d.idist = istride * (p->is_in_place ? n_after * 2
			       : nr * (n_after / (nr/2 + 1)));
	  d.out = out;
	  d.ostride = ostride;
	  d.odist = ostride * n_after;
	  d.work = work;

	  fftw_thread_spawn_loop(n, nthreads, real2c_aux_thread, &d);
     }

     /* do the current dimension (in-place): */
     /* (Use internal function instead of fftw_threads so that we can
	pass our workspace array.) */
     fftw_executor_many_inplace_threads(p->plans[cur_dim]->n,
					out, work, p->plans[cur_dim]->root,
					n_after * ostride, n_after, ostride,
					nthreads);
}

static void *c2real_aux_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     aux_data *d = (aux_data *) ldata->data;
     fftwnd_plan p = d->p;
     int cur_dim = d->cur_dim;
     fftw_complex *in = (fftw_complex *) d->in;
     int istride = d->istride, idist = d->idist;
     fftw_real *out = (fftw_real *) d->out;
     int ostride = d->ostride, odist = d->odist;
     fftw_real *work = (fftw_real*) (d->work + p->nwork * ldata->thread_num);

     for (; min < max; ++min)
	  rfftwnd_c2real_aux(p, cur_dim, in + idist * min, istride,
			     out + odist * min, ostride, work);
     return 0;
}

void rfftwnd_c2real_threads_aux(fftwnd_plan p, int cur_dim,
				fftw_complex *in, int istride,
				fftw_real *out, int ostride,
				fftw_complex *work, int nthreads)
{
     int n_after = p->n_after[cur_dim], n = p->n[cur_dim];

     /* do the current dimension (in-place): */
     /* (Use internal function instead of fftw_threads so that we can
	pass our workspace array.) */
     fftw_executor_many_inplace_threads(p->plans[cur_dim]->n,
					in, work, p->plans[cur_dim]->root,
					n_after * istride, n_after, istride,
					nthreads);

     if (cur_dim == p->rank - 2) {
	  /* just do the last dimension directly: */
	  if (p->is_in_place)
	       rfftw_c2real_threads_aux(p->plans[p->rank - 1], n,
					in, istride, n_after * istride,
					out, istride, (n_after * istride) * 2,
					(fftw_real *) work, nthreads);
	  else
	       rfftw_c2real_threads_aux(p->plans[p->rank - 1], n,
					in, istride, n_after * istride,
					out, ostride,
					p->plans[p->rank - 1]->n * ostride,
					(fftw_real *) work, nthreads);
     }
     else {	/* we have at least two dimensions to go */
	  int nr = p->plans[p->rank - 1]->n;
	  aux_data d;

	  d.p = p;
	  d.cur_dim = cur_dim + 1;
	  d.in = in;
	  d.istride = istride;
	  d.odist = ostride * (p->is_in_place ? n_after * 2
			       : nr * (n_after / (nr/2 + 1)));
	  d.out = out;
	  d.ostride = ostride;
	  d.idist = istride * n_after;
	  d.work = work;

	  fftw_thread_spawn_loop(n, nthreads, c2real_aux_thread, &d);
     }
}

typedef struct {
     fftw_plan p;
     int howmany;
     void *in;
     int istride, idist, idist0;
     void *out;
     int ostride, odist, odist0;
     fftw_real *work;
     int wdist;
} howmany_aux_data;

static void *r2c_overlap_howmany_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     howmany_aux_data *d = (howmany_aux_data *) ldata->data;
     fftw_plan p = d->p;
     int howmany = d->howmany;
     fftw_real *in = (fftw_real *) d->in;
     int istride = d->istride, idist = d->idist, idist0 = d->idist0;
     fftw_complex *out = (fftw_complex *) d->out;
     int ostride = d->ostride, odist = d->odist, odist0 = d->odist0;
     fftw_real *work = d->work + d->wdist * ldata->thread_num;

     for (; min < max; ++min)
	  rfftw_real2c_overlap_aux(p, howmany,
				   in + min * idist0, istride, idist,
				   out + min * odist0, ostride, odist,
				   work);

     return 0;
}

static void *c2r_overlap_howmany_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     howmany_aux_data *d = (howmany_aux_data *) ldata->data;
     fftw_plan p = d->p;
     int howmany = d->howmany;
     fftw_real *out = (fftw_real *) d->out;
     int istride = d->istride, idist = d->idist, idist0 = d->idist0;
     fftw_complex *in = (fftw_complex *) d->in;
     int ostride = d->ostride, odist = d->odist, odist0 = d->odist0;
     fftw_real *work = d->work + d->wdist * ldata->thread_num;

     for (; min < max; ++min)
	  rfftw_c2real_overlap_aux(p, howmany,
				   in + min * idist0, istride, idist,
				   out + min * odist0, ostride, odist,
				   work);

     return 0;
}

static void *r2c_howmany_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     howmany_aux_data *d = (howmany_aux_data *) ldata->data;
     fftw_plan p = d->p;
     int howmany = d->howmany;
     fftw_real *in = (fftw_real *) d->in;
     int istride = d->istride, idist = d->idist, idist0 = d->idist0;
     fftw_complex *out = (fftw_complex *) d->out;
     int ostride = d->ostride, odist = d->odist, odist0 = d->odist0;
     fftw_real *work = d->work + d->wdist * ldata->thread_num;

     for (; min < max; ++min)
	  rfftw_real2c_aux(p, howmany,
			   in + min * idist0, istride, idist,
			   out + min * odist0, ostride, odist,
			   work);

     return 0;
}

static void *c2r_howmany_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     howmany_aux_data *d = (howmany_aux_data *) ldata->data;
     fftw_plan p = d->p;
     int howmany = d->howmany;
     fftw_real *out = (fftw_real *) d->out;
     int istride = d->istride, idist = d->idist, idist0 = d->idist0;
     fftw_complex *in = (fftw_complex *) d->in;
     int ostride = d->ostride, odist = d->odist, odist0 = d->odist0;
     fftw_real *work = d->work + d->wdist * ldata->thread_num;

     for (; min < max; ++min)
	  rfftw_c2real_aux(p, howmany,
			   in + min * idist0, istride, idist,
			   out + min * odist0, ostride, odist,
			   work);

     return 0;
}

typedef struct {
     fftwnd_plan p;
     int cur_dim;
     int howmany;
     void *in;
     int istride, idist, idist0;
     void *out;
     int ostride, odist, odist0;
     fftw_complex *work;
     int wdist;
} howmany_hyperslab_aux_data;

static void *r2c_hyperslab_howmany_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     howmany_hyperslab_aux_data *d = (howmany_hyperslab_aux_data*) ldata->data;
     fftwnd_plan p = d->p;
     int cur_dim = d->cur_dim;
     int howmany = d->howmany;
     fftw_real *in = (fftw_real *) d->in;
     int istride = d->istride, idist = d->idist, idist0 = d->idist0;
     fftw_complex *out = (fftw_complex *) d->out;
     int ostride = d->ostride, odist = d->odist, odist0 = d->odist0;
     fftw_complex *work = d->work + d->wdist * ldata->thread_num;

     for (; min < max; ++min)
	  rfftwnd_real2c_aux_howmany(p, cur_dim, howmany,
				     in + min * idist0, istride, idist,
				     out + min * odist0, ostride, odist,
				     work);

     return 0;
}

static void *c2r_hyperslab_howmany_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     howmany_hyperslab_aux_data *d = (howmany_hyperslab_aux_data*) ldata->data;
     fftwnd_plan p = d->p;
     int cur_dim = d->cur_dim;
     int howmany = d->howmany;
     fftw_real *out = (fftw_real *) d->out;
     int istride = d->istride, idist = d->idist, idist0 = d->idist0;
     fftw_complex *in = (fftw_complex *) d->in;
     int ostride = d->ostride, odist = d->odist, odist0 = d->odist0;
     fftw_complex *work = d->work + d->wdist * ldata->thread_num;

     for (; min < max; ++min)
	  rfftwnd_c2real_aux_howmany(p, cur_dim, howmany,
				     in + min * idist0, istride, idist,
				     out + min * odist0, ostride, odist,
				     work);

     return 0;
}

typedef struct {
     fftw_plan p;
     int howmany;
     fftw_complex *io_data;
     int iostride, iodist, iodist0;
     fftw_complex *work;
     int wdist;
} fftw_howmany_data;

static void *fftw_howmany_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     fftw_howmany_data *d = (fftw_howmany_data*) ldata->data;
     fftw_plan p = d->p;
     int howmany = d->howmany;
     fftw_complex *io_data = d->io_data;
     int iostride = d->iostride, iodist = d->iodist, iodist0 = d->iodist0;
     fftw_complex *work = d->work + d->wdist * ldata->thread_num;

     for (; min < max; ++min)
	  fftw(p, howmany, io_data + min*iodist0, iostride, iodist, work,1,0);

     return 0;
}

/*
 * alternate version of rfftwnd_aux -- this version pushes the howmany
 * loop down to the leaves of the computation, for greater locality
 * in cases where dist < stride.  It is also required for correctness
 * if in==out, and we must call a special version of the executor.
 * Note that work must point to 'howmany' copies of its data
 * if in == out. 
 */

void rfftwnd_real2c_aux_howmany_threads(fftwnd_plan p, int cur_dim,
					int howmany,
					fftw_real *in, int istride, int idist,
					fftw_complex *out,
					int ostride, int odist,
					fftw_complex *work, int nwork,
					int nthreads)
{
     int n_after = p->n_after[cur_dim], n = p->n[cur_dim];

     if (cur_dim == p->rank - 2) {
	  howmany_aux_data d;

	  d.p = p->plans[p->rank - 1];
	  d.howmany = howmany;
	  d.in = in;
	  d.istride = istride; d.idist = idist;
	  d.out = out;
	  d.ostride = ostride; d.odist = odist;
	  d.work = (fftw_real *) work;
	  d.wdist = nwork * 2;

	  /* just do the last dimension directly: */
	  if (p->is_in_place) {
	       d.idist0 =  n_after * istride * 2;
	       d.odist0 = n_after * ostride;
	       fftw_thread_spawn_loop(n, nthreads,
				      r2c_overlap_howmany_thread, &d);
	  }
	  else {
	       d.idist0 = p->plans[p->rank - 1]->n * istride;
	       d.odist0 = n_after * ostride;
	       fftw_thread_spawn_loop(n, nthreads,
				      r2c_howmany_thread, &d);
	  }
     } 
     else {      /* we have at least two dimensions to go */
	  /* 
	   * process the subsequent dimensions recursively, in hyperslabs,
	   * to get maximum locality: 
	   */

	  int nr = p->plans[p->rank - 1]->n;
	  int n_after_r = p->is_in_place ? n_after * 2 : 
	       nr * (n_after / (nr/2 + 1));
	  howmany_hyperslab_aux_data d;

	  d.p = p;
	  d.cur_dim = cur_dim + 1;
	  d.howmany = howmany;
	  d.in = in;
	  d.istride = istride;
	  d.idist = idist;
	  d.idist0 = n_after_r * istride;
	  d.out = out;
	  d.ostride = ostride;
	  d.odist = odist;
	  d.odist0 = n_after * ostride;
	  d.work = work;
	  d.wdist = nwork;

	  fftw_thread_spawn_loop(n, nthreads,
				 r2c_hyperslab_howmany_thread, &d);
     }

     /* do the current dimension (in-place): */
     {
	  fftw_howmany_data d;

	  d.p = p->plans[cur_dim];
	  d.howmany = howmany;
	  d.io_data = out;
	  d.iostride = n_after * ostride;
	  d.iodist = odist;
	  d.iodist0 = ostride;
	  d.work =  work;
	  d.wdist = nwork;

	  fftw_thread_spawn_loop(n_after, nthreads, fftw_howmany_thread, &d);
     }
}

void rfftwnd_c2real_aux_howmany_threads(fftwnd_plan p, int cur_dim,
					int howmany,
					fftw_complex *in,
					int istride, int idist,
					fftw_real *out,
					int ostride, int odist,
					fftw_complex *work, int nwork,
					int nthreads)
{
     int n_after = p->n_after[cur_dim], n = p->n[cur_dim];

     /* do the current dimension (in-place): */
     {
          fftw_howmany_data d;

          d.p = p->plans[cur_dim];
          d.howmany = howmany;
          d.io_data = in;
          d.iostride = n_after * istride;
          d.iodist = idist;
          d.iodist0 = istride;
          d.work = work;
	  d.wdist = nwork;

          fftw_thread_spawn_loop(n_after, nthreads, fftw_howmany_thread, &d);
     }

     if (cur_dim == p->rank - 2) {
          howmany_aux_data d;

          d.p = p->plans[p->rank - 1];
          d.howmany = howmany;
          d.in = in;
          d.istride = istride; d.idist = idist;
          d.out = out;
          d.ostride = ostride; d.odist = odist;
          d.work = (fftw_real *) work;
	  d.wdist = nwork * 2;

	  /* just do the last dimension directly: */
          if (p->is_in_place) {
               d.idist0 = n_after * istride;
               d.odist0 = n_after * ostride * 2;
               fftw_thread_spawn_loop(n, nthreads,
                                      c2r_overlap_howmany_thread, &d);
          }
          else {
               d.odist0 = p->plans[p->rank - 1]->n * ostride;
               d.idist0 = n_after * istride;
               fftw_thread_spawn_loop(n, nthreads,
                                      c2r_howmany_thread, &d);
          }
     } 
     else {			/* we have at least two dimensions to go */
          /*
           * process the subsequent dimensions recursively, in hyperslabs,
           * to get maximum locality:
           */

          int nr = p->plans[p->rank - 1]->n;
          int n_after_r = p->is_in_place ? n_after * 2 :
               nr * (n_after / (nr/2 + 1));
          howmany_hyperslab_aux_data d;

          d.p = p;
          d.cur_dim = cur_dim + 1;
          d.howmany = howmany;
          d.in = in;
          d.istride = istride;
          d.idist = idist;
          d.idist0 = n_after * istride;
          d.out = out;
          d.ostride = ostride;
          d.odist = odist;
          d.odist0 = n_after_r * ostride;
          d.work = work;
	  d.wdist = nwork;

          fftw_thread_spawn_loop(n, nthreads,
                                 c2r_hyperslab_howmany_thread, &d);
     }
}

/********** Computing the N-Dimensional FFT: User-Visible Routines **********/

void rfftwnd_threads_real_to_complex(int nthreads, fftwnd_plan p, int howmany,
				     fftw_real *in, int istride, int idist,
				     fftw_complex *out, int ostride, int odist)
{
     fftw_complex *work = 0;
     int rank = p->rank;
     int nwork = p->nwork, size_work = nwork * nthreads;

     if (p->dir != FFTW_REAL_TO_COMPLEX)
	  fftw_die("rfftwnd_real_to_complex with complex-to-real plan");

     if (p->is_in_place) {
	  ostride = istride;
	  odist = (idist == 1) ? 1 : (idist / 2);	/* ugh */
	  out = (fftw_complex *) in;
	  if (howmany > 1 && istride > idist && rank > 0) {
	       int new_nwork = p->n[rank - 1] * howmany;
	       if (new_nwork > nwork)
		    nwork = new_nwork;
	       if (rank != 1) {
		    if (nwork * nthreads > size_work)
			 size_work = nwork * nthreads;
	       }
	       else
		    size_work = nwork;
	  }
     }

     work = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * size_work);

     switch (rank) {
	 case 0:
	      break;
	 case 1:
	      if (p->is_in_place && howmany > 1 && istride > idist)
		   rfftw_real2c_overlap_threads_aux(p->plans[0], howmany,
						    in, istride, idist,
						    out, ostride, odist,
						    (fftw_real *) work,
						    nthreads);
	      else
		   rfftw_real2c_threads_aux(p->plans[0], howmany,
					    in, istride, idist,
					    out, ostride, odist,
					    (fftw_real *) work, nthreads);
	      break;
	 default:		/* rank >= 2 */
	      {
		   if (howmany > 1 && ostride > odist)
			rfftwnd_real2c_aux_howmany_threads(p, 0, howmany,
							   in, istride, idist,
							   out, ostride, odist,
							   work, nwork,
							   nthreads);
		   else {
			int i;

			for (i = 0; i < howmany; ++i)
			     rfftwnd_real2c_threads_aux(p, 0,
							in + i * idist,
							istride,
							out + i * odist,
							ostride,
							work,
							nthreads);
		   }
	      }
     }

     fftw_free(work);
}

void rfftwnd_threads_complex_to_real(int nthreads, fftwnd_plan p, int howmany,
				     fftw_complex *in, int istride, int idist,
				     fftw_real *out, int ostride, int odist)
{
     fftw_complex *work = 0;
     int rank = p->rank;
     int nwork = p->nwork, size_work = nwork * nthreads;

     if (p->dir != FFTW_COMPLEX_TO_REAL)
	  fftw_die("rfftwnd_complex_to_real with real-to-complex plan");

     if (p->is_in_place) {
	  ostride = istride;
	  odist = idist;
	  odist = (idist == 1) ? 1 : (idist * 2);	/* ugh */
	  out = (fftw_real *) in;
	  if (howmany > 1 && istride > idist && rank > 0) {
	       int new_nwork = p->n[rank - 1] * howmany;
	       if (new_nwork > nwork)
		    nwork = new_nwork;
	       if (rank != 1) {
		    if (nwork * nthreads > size_work)
			 size_work = nwork * nthreads;
	       }
	       else
		    size_work = nwork;
	  }
     }

     work = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * size_work);

     switch (rank) {
	 case 0:
	      break;
	 case 1:
              if (p->is_in_place && howmany > 1 && istride > idist)
                   rfftw_c2real_overlap_threads_aux(p->plans[0], howmany,
                                                    in, istride, idist,
                                                    out, ostride, odist,
                                                    (fftw_real *) work,
                                                    nthreads);
              else
                   rfftw_c2real_threads_aux(p->plans[0], howmany,
                                            in, istride, idist,
                                            out, ostride, odist,
                                            (fftw_real *) work, nthreads);
	      break;
	 default:		/* rank >= 2 */
	      {
		   if (howmany > 1 && ostride > odist)
                       rfftwnd_c2real_aux_howmany_threads(p, 0, howmany,
                                                           in, istride, idist,
                                                           out, ostride, odist,
                                                           work, nwork,
                                                           nthreads);
		   else {
			int i;

			for (i = 0; i < howmany; ++i)
                             rfftwnd_c2real_threads_aux(p, 0,
                                                        in + i * idist,
                                                        istride,
                                                        out + i * odist,
                                                        ostride,
                                                        work,
                                                        nthreads);
		   }
	      }
     }

     fftw_free(work);
}

void rfftwnd_threads_one_real_to_complex(int nthreads, fftwnd_plan p,
					 fftw_real *in, fftw_complex *out)
{
     rfftwnd_threads_real_to_complex(nthreads, p, 1, in, 1, 1, out, 1, 1);
}

void rfftwnd_threads_one_complex_to_real(int nthreads, fftwnd_plan p,
					 fftw_complex *in, fftw_real *out)
{
     rfftwnd_threads_complex_to_real(nthreads, p, 1, in, 1, 1, out, 1, 1);
}


syntax highlighted by Code2HTML, v. 0.9.1