/* Elliptic Curve Method implementation: stage 2 routines.

  Copyright 2002, 2003, 2004, 2005 Paul Zimmermann and Alexander Kruppa.

  This file is part of the ECM Library.

  The ECM Library is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or (at your
  option) any later version.

  The ECM Library is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with the ECM Library; see the file COPYING.LIB.  If not, write to
  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  MA 02111-1307, USA.
*/

#include <stdlib.h>
#include <math.h>
#include "ecm-impl.h"

/* R_i <- q_i * S, 0 <= i < n, where q_i are large integers, S is a point on
   an elliptic curve. Uses max(bits in q_i) modular inversions (one less if 
   max(q_i) is a power of 2). Needs up to n+2 cells in T.
   Returns whether factor was found or not found, factor goes into p. 
   No error can occur.
*/

static int
multiplyW2n (mpz_t p, point *R, curve *S, mpz_t *q, const unsigned int n, 
              mpmod_t modulus, mpres_t u, mpres_t v, mpres_t *T,
              unsigned long *tot_muls, unsigned long *tot_gcds)
{
  unsigned int i, maxbit, k; /* k is the number of values to batch invert */
  unsigned int l, t, muls = 0, gcds = 0;
#ifdef WANT_EXPCOST
  unsigned int hamweight = 0;
#endif
  int youpi = ECM_NO_FACTOR_FOUND;
  mpz_t flag; /* Used as bit field, keeps track of which R[i] contain partial results */
  point s;    /* 2^t * S */
  mpz_t signs; /* Used as bit field, i-th bit is set iff q[i]<0 */
  mpz_t __dummy; /* used for local computations */

  if (n == 0)
    return ECM_NO_FACTOR_FOUND;
  
  /* Is S the neutral element ? */
  if (mpres_is_zero (S->x, modulus) && mpres_is_zero (S->y, modulus))
    {
      for (i = 0; i < n; i++)
        {
          mpres_set (R[i].x, S->x, modulus);
          mpres_set (R[i].y, S->y, modulus);
        }
      return ECM_NO_FACTOR_FOUND;
    }
  
  MPZ_INIT2 (flag, n);
  MPZ_INIT2 (signs, n);
  mpres_init (s.x, modulus);
  mpres_init (s.y, modulus);
  mpres_set (s.x, S->x, modulus);
  mpres_set (s.y, S->y, modulus);

  /* Set maxbit to index of highest set bit among all the q[i] */
  /* Index of highest bit of q is sizeinbase(q, 2) - 1 */
  maxbit = 0;
  for (i = 0; i < n; i++)
    {
      /* We'll first compute positive multiples and change signs later */
      if (mpz_sgn (q[i]) < 0)
        {
          mpz_setbit (signs, i);;
          mpz_neg (q[i], q[i]);
        }

      /* Multiplier == 0? Then set result to neutral element */
      if (mpz_sgn (q[i]) == 0)
        {
           mpres_set_ui (R[i].x, 0, modulus);
           mpres_set_ui (R[i].y, 0, modulus);
        }
#ifdef WANT_EXPCOST
      else
        hamweight += mpz_popcount (q[i]) - 1;
#endif
      if ((t = mpz_sizeinbase (q[i], 2) - 1) > maxbit)
          maxbit = t;
    }

#ifdef WANT_EXPCOST
  outputf (OUTPUT_ALWAYS, "Expecting %d multiplications and %d extgcds\n", 
          4 * (maxbit) + 6 * hamweight - 3, maxbit + 1);      /* maxbit is floor(log_2(max(q_i))) */
#endif

  for (t = 0; t <= maxbit && !youpi; t++)    /* Examine t-th bit of the q[i] */
    {
      /* See which values need inverting and put them into T[]. Keep number
         of those values in k */
      k = 0;
      
      /* Will we have to double s at the end of this pass? If yes,  
         schedule 2*s.y for inverting */
      if (t < maxbit)
        mpres_add (T[k++], s.y, s.y, modulus);
      
      for (i = 0; i < n && !youpi; i++) 
        if (mpz_tstbit (q[i], t))       /* If q[i] & (1<<t), we'll add s to R[i] */
          if (mpz_tstbit (flag, i))     /* Does R[i] contain a partial result yet ? */
            {                           /* If Yes: need actual point addition so */
              mpres_sub (T[k], s.x, R[i].x, modulus); /* schedule (s.x-R[i].x) for inverting */
              if (k > 0)
                mpres_mul (T[k], T[k], T[k - 1], modulus);
              k++;
            }                           /* If No: we'll simply set R[i] to s later on, nothing tbd here */
      
      /* So there are k values in need of inverting, call them v[m], 0 <= m < k. */      
      /* Here T[m], 0 <= m < k, contains v[0]*...*v[m] */
      
      /* Put inverse of the product of all scheduled values in T[k]*/
      if (k > 0)
        {
          muls += 3 * (k - 1);
          gcds++;
          if (!mpres_invert (T[k], T[k - 1], modulus))
            {
              /* If a factor was found, put factor in p, 
                 flag success and bail out of loop */
              mpres_gcd (p, T[k - 1], modulus);
              youpi = ECM_FACTOR_FOUND_STEP2;
              break;
            }
        }
      
      /* T[k] now contains 1/(v[0]*...*v[k - 1]), 
         T[m], 0 <= m < k, still contain v[0]*...*v[m] */
      
      l = k - 1;

      for (i = n; i-- > 0; ) /* Go through the R[i] again, backwards */
        if (mpz_tstbit (q[i], t))
          {
            if (mpz_tstbit (flag, i))
              {
                /* T[k] contains 1/(v[0]*...*v[l]) */
                if (l > 0) /* need to separate the values */
                  {
                    /* T[l - 1] has v[0]*...*v[l-1] */
                    mpres_mul (T[l], T[l - 1], T[k], modulus); /* So T[l] now has 1/v[l] == 1/(s.x - R[i].x) */
                    mpres_sub (u, s.x, R[i].x, modulus);
                    mpres_mul (T[k], T[k], u, modulus);        /* T[k] now has 1/(v[0]*...*v[l - 1]) */
                  }
                else
                  {
                    /* T[k] contains 1/v[0] */
                    mpres_set (T[0], T[k], modulus); 
                  }
                
                /* 1/(s.x - R[i].x) is in T[l] */
#ifdef WANT_ASSERT
                mpres_sub (u, s.x, R[i].x, modulus);
                mpres_mul (u, u, T[l], modulus);
		mpz_init(__dummy);
                mpres_get_z (__dummy, u, modulus);
                mpz_mod (__dummy, __dummy, modulus->orig_modulus);
                if (mpz_cmp_ui (__dummy, 1) != 0) 
                  outputf (OUTPUT_ERROR, "Error, (s.x - R[%d].x) * T[%d] == "
                           "%Zd\n", i, l, __dummy);
		mpz_clear(__dummy);
#endif
                
                mpres_sub (u, s.y, R[i].y, modulus);   /* U    = y2 - y1 */
                mpres_mul (T[l], T[l], u, modulus);    /* T[l] = (y2-y1)/(x2-x1) = lambda */
                mpres_mul (u, T[l], T[l], modulus);    /* U    = lambda^2 */
                mpres_sub (u, u, R[i].x, modulus);     /* U    = lambda^2 - x1 */
                mpres_sub (R[i].x, u, s.x, modulus);   /* x3   = lambda^2 - x1 - x2 */
                mpres_sub (u, s.x, R[i].x, modulus);   /* U    = x2 - x3 */
                mpres_mul (u, u, T[l], modulus);       /* U    = lambda*(x2 - x3) */
                mpres_sub (R[i].y, u, s.y, modulus);   /* y3   = lambda*(x2 - x3) - y2 */
                muls += 3;
                l--;
              }
            else /* R[i] does not contain a partial result. */
              {
                mpres_set (R[i].x, s.x, modulus);   /* Just set R[i] to s */
                mpres_set (R[i].y, s.y, modulus);
                mpz_setbit (flag, i);               /* and flag it as used */
              }
          }
      
      if (t < maxbit) /* Double s */
        { 
          ASSERT(l==0);
#ifdef WANT_ASSERT
          mpres_add (u, s.y, s.y, modulus);
          mpres_mul (u, u, T[k], modulus);
	  mpz_init(__dummy);
          mpres_get_z (__dummy, u, modulus);
          mpz_mod (__dummy, __dummy, modulus->orig_modulus);
          if (mpz_cmp_ui (__dummy, 1) != 0)
            outputf (OUTPUT_ERROR, "Error, at t==%d, 2*s.y / (2*s.y) == %Zd\n", 
                     t, __dummy);
	  mpz_clear(__dummy);
#endif          

                                               /* 1/(2*s.y) is in T[k] */
          mpres_mul (u, s.x, s.x, modulus);    /* U = X^2 */
          mpres_mul_ui (u, u, 3, modulus);     /* U = 3*X^2 */
          mpres_add (u, u, S->A, modulus);     /* U = 3*X^2 + A */
          mpres_mul (T[k], T[k], u, modulus);  /* T = (3*X^2 + A) / (2*Y) = lambda */
          mpres_mul (u, T[k], T[k], modulus);  /* U = lambda^2 */
          mpres_sub (u, u, s.x, modulus);      /* U = lambda^2 - X */
          mpres_sub (u, u, s.x, modulus);      /* U = lambda^2 - 2*X = s.x' */
          mpres_sub (v, s.x, u, modulus);      /* V = s.x - s.x' */
          mpres_mul (v, v, T[k], modulus);     /* V = lambda*(s.x - s.x') */
          mpres_sub (s.y, v, s.y, modulus);    /* s.y' = lambda*(s.x - s.x') - s.y */
          mpres_set (s.x, u, modulus);
          muls += 4;
        }
    }

  mpres_clear (s.y, modulus);
  mpres_clear (s.x, modulus);
  mpz_clear (flag);

  if (tot_muls != NULL)
    *tot_muls += muls;
  if (tot_gcds != NULL)
    *tot_gcds += gcds;
  
  /* Now take inverse points (negative y-coordinate) where q[i] was < 0 */
  for (i = 0; i < n; i++)
    if (mpz_tstbit (signs, i))
      {
        mpz_neg (R[i].y, R[i].y);
        mpz_neg (q[i], q[i]);
      }

  mpz_clear (signs);
  return youpi;
}


/* Input: Points X[0]..X[(n+1)*m-1]
   T is used for temporary values and needs to have (n-1)*m+1 entries.

   Performs the following loop with only one gcdext, using Montgomery's trick:
   for (i=0;i<m;i++)
     for (j=0;j<n;j++)
         (x[j+(n+1)*i] : y[j+(n+1)*i]) += (x[j+1+(n+1)*i] : y[j+1+(n+1)*i])

   Uses one inversion and 6*n*m-3 multiplications for n*m > 0.
   Processes neutral (zero), identical and negative points correctly.

   Return factor found or not (no error can occur here).
*/

static int
addWnm (mpz_t p, point *X, curve *S, mpmod_t modulus, unsigned int m, 
        unsigned int n, mpres_t *T, unsigned long *tot_muls, 
        unsigned long *tot_gcds)
{
  unsigned int k, l;
  int i, j;

  if (n == 0 || m == 0)
    return ECM_NO_FACTOR_FOUND;

  k = 0;
  for (i = m - 1; i >= 0; i--)    /* Go through the m different lists */
    for (j = n - 1; j >= 0; j--)  /* Go through each list backwards */
      {                           /* And prepare the values to be inverted */
        point *X1, *X2;
        X1 = X + i * (n + 1) + j;
        X2 = X + i * (n + 1) + j + 1;
        
        /* If either element is the neutral element, nothing tbd here */
        if ((mpres_is_zero (X1->x, modulus) && mpres_is_zero (X1->y, modulus)) ||
            (mpres_is_zero (X2->x, modulus) && mpres_is_zero (X2->y, modulus)))
          continue;
        
        mpres_sub (T[k], X2->x, X1->x, modulus); /* Schedule X2.x - X1.x */

        if (mpres_is_zero (T[k], modulus))  /* If both x-cordinates are identical */
          {
            /* Are the points identical? Compare y coordinates: */
            mpres_sub (T[k], X2->y, X1->y, modulus);
            if (mpres_is_zero (T[k], modulus))
              {
                /* Yes, we need to double. Schedule 2*X[...].y */
                mpres_add (T[k], X1->y, X1->y, modulus); 
              }
            else /* No, they are inverses. Nothing tbd here */
              {
#ifdef WANT_ASSERT
                /* Check that the y coordinates are mutual negatives */
                mpres_add (T[k], X2->y, X1->y, modulus);
                ASSERT (mpres_is_zero (T[k], modulus));
#endif
                continue; 
              }
          }

        if (k > 0)
          mpres_mul (T[k], T[k], T[k - 1], modulus);
        k++;
      }

  /* v_m = X[i * (n + 1) + j] - X[i * (n + 1) + j + 1], 0 <= j < n,
     and m = i * n + j */
  /* Here T[m] = v_0 * ... * v_m, 0 <= m < k */

  if (k > 0 && !mpres_invert (T[k], T[k - 1], modulus))
    {
      mpres_gcd (p, T[k - 1], modulus);
      (*tot_muls) += m * n - 1;
      (*tot_gcds) ++;
      return ECM_FACTOR_FOUND_STEP2;
    }

  /* T[k] = 1/(v_0 * ... * v_m), 0 <= m < k */

  l = k - 1;

  for (i = 0; (unsigned) i < m; i++)
    for (j = 0; (unsigned) j < n; j++)
      {
        point *X1, *X2;
        X1 = X + i * (n + 1) + j;
        X2 = X + i * (n + 1) + j + 1;
        
        /* Is X1 the neutral element? */
        if (mpres_is_zero (X1->x, modulus) && mpres_is_zero (X1->y, modulus))
          {
            /* Yes, set X1 to X2 */
            mpres_set (X1->x, X2->x, modulus);
            mpres_set (X1->y, X2->y, modulus);
            continue;
          }
        
        /* Is X2 the neutral element? If so, X1 stays the same */
        if (mpres_is_zero (X2->x, modulus) && mpres_is_zero (X2->y, modulus))
          continue;
        
        /* Are the x-coordinates identical? */
        mpres_sub (T[k + 1], X2->x, X1->x, modulus);
        if (mpres_is_zero (T[k + 1], modulus))
          {
            /* Are the points inverses of each other? */
            mpres_sub (T[k + 1], X2->y, X1->y, modulus);
            if (!mpres_is_zero (T[k + 1], modulus))
              {
                /* Yes. Set X1 to neutral element */
                mpres_set_ui (X1->x, 0, modulus);
                mpres_set_ui (X1->y, 0, modulus);
                continue;
              }
            /* No, we need to double. Restore T[k+1] */
            mpres_sub (T[k + 1], X2->x, X1->x, modulus);
          }

        if (l == 0)
          mpz_set (T[0], T[k]);
        else
          mpres_mul (T[l], T[k], T[l - 1], modulus); 
          /* T_l = 1/(v_0 * ... * v_l) * (v_0 * ... * v_{l-1}) = 1/v_l */


        if (mpres_is_zero (T[k + 1], modulus)) /* Identical points, so double X1 */
          {
            if (l > 0)
              {
                mpres_add (T[k + 1], X1->y, X1->y, modulus); /* T[k+1] = v_{l} */
                mpres_mul (T[k], T[k], T[k + 1], modulus);
                /* T_k = 1/(v_0 * ... * v_l) * v_l = 1/(v_0 * ... * v_{l-1}) */
              }
            
            mpres_mul (T[k + 1], X1->x, X1->x, modulus);
            mpres_mul_ui (T[k + 1], T[k + 1], 3, modulus);
            mpres_add (T[k + 1], T[k + 1], S->A, modulus);
            mpres_mul (T[l], T[k + 1], T[l], modulus); /* T[l] = lambda */
            mpres_mul (T[k + 1], T[l], T[l], modulus);       /* T1   = lambda^2 */
            mpres_sub (T[k + 1], T[k + 1], X1->x, modulus);  /* T1   = lambda^2 - x1 */
            mpres_sub (X1->x, T[k + 1], X2->x, modulus);     /* X1.x = lambda^2 - x1 - x2 = x3 */
            mpres_sub (T[k + 1], X2->x, X1->x, modulus);     /* T1   = x2 - x3 */
            mpres_mul (T[k + 1], T[k + 1], T[l], modulus);   /* T1   = lambda*(x2 - x3) */
            mpres_sub (X1->y, T[k + 1], X2->y, modulus);     /* Y1   = lambda*(x2 - x3) - y2 = y3 */
          }
        else
          {
            if (l > 0)
              {
                mpres_mul (T[k], T[k], T[k + 1], modulus);
                /* T_k = 1/(v_0 * ... * v_l) * v_l = 1/(v_0 * ... * v_{l-1}) */
              }

            mpres_sub (T[k + 1], X2->y, X1->y, modulus);     /* T1   = y2 - y1 */
            mpres_mul (T[l], T[l], T[k + 1], modulus);       /* Tl   = (y2 - y1) / (x2 - x1) = lambda */
            mpres_mul (T[k + 1], T[l], T[l], modulus);       /* T1   = lambda^2 */
            mpres_sub (T[k + 1], T[k + 1], X1->x, modulus);  /* T1   = lambda^2 - x1 */
            mpres_sub (X1->x, T[k + 1], X2->x, modulus);     /* X1.x = lambda^2 - x1 - x2 = x3 */
            mpres_sub (T[k + 1], X2->x, X1->x, modulus);     /* T1   = x2 - x3 */
            mpres_mul (T[k + 1], T[k + 1], T[l], modulus);   /* T1   = lambda*(x2 - x3) */
            mpres_sub (X1->y, T[k + 1], X2->y, modulus);     /* Y1   = lambda*(x2 - x3) - y2 = y3 */
          }
        
        l--;
      }

  if (tot_muls != NULL)
    (*tot_muls) += 6 * m * n - 3;
  if (tot_gcds != NULL)
    (*tot_gcds) ++;

  return ECM_NO_FACTOR_FOUND;
}

/* puts in F[0..dF-1] the successive values of 

   Dickson_{S, a} (j * d2) * s  where s is a point on the elliptic curve

   for j == 1 mod 6, j and d1 coprime.
   Returns non-zero iff a factor was found (then stored in f)
   or an error occurred.
*/

#ifdef MONT_ROOTS
static int
ecm_rootsF_Mont (listz_t F, root_params_t *root_params, 
		 unsigned long dF, curve *s, mpmod_t modulus)
{
  unsigned long i, n;
  unsigned long muls = 0;
  long st;
  mpres_t x6, z6, xn_6, zn_6, xn, zn; /* 6*d2*X, (n-6)*d2*X and n*d2*X, resp. */
  mpres_t b, u, v, w;
  mpz_t t;

  if (dF == 0)
    return ECM_NO_FACTOR_FOUND;

  st = cputime ();

  MPZ_INIT (t);

  mpres_init (b, modulus);
  mpres_add_ui (b, s->A, 2, modulus);
  mpres_div_2exp (b, b, 2, modulus);

  /* We init n to 7, so n-6 == 1 */
  mpres_init (xn_6, modulus);
  mpres_init (zn_6, modulus);
  mpres_set (xn_6, s->x, modulus);
  mpres_set (zn_6, s->y, modulus);
  mpz_set_ui (t, root_params->d2);
  ecm_mul (xn_6, zn_6, t, modulus, b);
  /* (xn_6::zn_6) = d2*X */

  mpres_init (x6, modulus);
  mpres_init (z6, modulus);
  mpres_set (x6, xn_6, modulus);
  mpres_set (z6, zn_6, modulus);
  mpz_set_ui (t, 6);
  ecm_mul (x6, z6, t, modulus, b);
  /* (x6::z6) = 6*d2*X */

  mpres_init (xn, modulus);
  mpres_init (zn, modulus);
  mpres_set (xn, xn_6, modulus);
  mpres_set (zn, zn_6, modulus);
  mpz_set_ui (t, 7);
  ecm_mul (xn, zn, t, modulus, b);
  /* xn :: zn = 7*d2*X */

  outputf (OUTPUT_TRACE, "ecm_rootsF_Mont: storing d2*X in F[0]\n");
  mpres_get_z (F[0], xn_6, modulus);
  /* F[0] = (d2*X)_x */

  mpres_init (u, modulus);
  mpres_init (v, modulus);
  mpres_init (w, modulus);

  n = 7;

  for (i = 1; i < dF; )
    {
      if (gcd (n, root_params->d1) == 1)
	{
	  outputf (OUTPUT_TRACE, 
		   "ecm_rootsF_Mont: storing d2*%lu*X in F[%lu]\n",
		   n, i);
	  mpres_get_z (F[i++], xn, modulus);
	  /* Normalize coordinate to zn=1 */
	  mpres_get_z (t, zn, modulus);
	  mpz_invert (t, t, modulus->orig_modulus);
	  mpz_mul (t, t, F[i++]);
	  mpz_mod (F[i++], t, modulus->orig_modulus);
	}
      
      add3 (xn_6, zn_6, xn, zn, x6, z6, xn_6, zn_6, 
	    modulus, u, v, w);
      mpres_swap (xn_6, xn, modulus);
      mpres_swap (zn_6, zn, modulus);
      n += 6; 
      /* Now (xn::zn) = j*d2*X and (xn_6::zn_6) = (j-6)*d2*X again */
      muls += 6;
    }

  outputf (OUTPUT_VERBOSE, "Computing roots of F took %ldms",
	   elltime (st, cputime ()));
  outputf (OUTPUT_DEVVERBOSE, ", and %ld muls", muls);
  outputf (OUTPUT_VERBOSE, "\n");

  mpres_clear (x6, modulus);
  mpres_clear (z6, modulus);
  mpres_clear (xn_6, modulus);
  mpres_clear (zn_6, modulus);
  mpres_clear (xn, modulus);
  mpres_clear (zn, modulus);
  mpres_clear (b, modulus);
  mpres_clear (u, modulus);
  mpres_clear (v, modulus);
  mpres_clear (w, modulus);
  mpz_clear (t);

  return ECM_NO_FACTOR_FOUND;
}
#endif

int
ecm_rootsF (mpz_t f, listz_t F, root_params_t *root_params, 
            unsigned long dF, curve *s, mpmod_t modulus)
{
  unsigned long i;
  unsigned long muls = 0, gcds = 0;
  long st;
  int youpi = ECM_NO_FACTOR_FOUND;
  listz_t coeffs;
  ecm_roots_state state;
  mpz_t t;
  
  if (dF == 0)
    return ECM_NO_FACTOR_FOUND;

  st = cputime ();

#ifdef MONT_ROOTS
  if (root_params->S == 1)
    return ecm_rootsF_Mont (F, root_params, dF, s, modulus);
#endif

  /* Relative cost of point add during init and computing roots assumed =1 */
  init_roots_state (&state, root_params->S, root_params->d1, root_params->d2, 
                    1.0);

  outputf (OUTPUT_DEVVERBOSE,
	   "ecm_rootsF: state: nr = %d, dsieve = %d, size_fd = %d, S = %d, "
	   "dickson_a = %d\n", state.nr, state.dsieve, state.size_fd, 
	   state.S, state.dickson_a);

  /* Init finite differences tables */
  MPZ_INIT (t); /* t = 0 */
  coeffs = init_progression_coeffs (t, state.dsieve, root_params->d2, 1, 6, 
                                    state.S, state.dickson_a);
  mpz_clear (t);

  if (coeffs == NULL) /* error */
    {
      youpi = ECM_ERROR;
      goto clear;
    }

  /* The highest coefficient is the same for all progressions, so set them
     to one for all but the first progression, later we copy the point.
     FIXME: can we avoid the multiplication of those points in multiplyW2n()
     below?
  */
  for (i = state.S + 1; i < state.size_fd; i += state.S + 1)
    mpz_set_ui (coeffs[i + state.S], 1);

  /* Allocate memory for fd[] and T[] */

  state.fd = (point *) malloc (state.size_fd * sizeof (point));
  if (state.fd == NULL)
    {
      youpi = ECM_ERROR;
      goto exit_ecm_rootsF;
    }
  for (i = 0; i < state.size_fd; i++)
    {
      outputf (OUTPUT_TRACE, "ecm_rootsF: coeffs[%d] = %Zd\n", i, coeffs[i]);
      MEMORY_TAG;
      mpres_init (state.fd[i].x, modulus);
      MEMORY_TAG;
      mpres_init (state.fd[i].y, modulus);
      MEMORY_UNTAG;
    }

  state.T = (mpres_t *) malloc ((state.size_fd + 4) * sizeof (mpres_t));
  if (state.T == NULL)
    {
      youpi = ECM_ERROR;
      goto ecm_rootsF_clearfdi;
    }
  for (i = 0 ; i < state.size_fd + 4; i++)
    {
      MEMORY_TAG;
      mpres_init (state.T[i], modulus);
      MEMORY_UNTAG;
    }

  /* Multiply fd[] = s * coeffs[] */

  youpi = multiplyW2n (f, state.fd, s, coeffs, state.size_fd, modulus,
                       state.T[0], state.T[1], state.T + 2, &muls, &gcds);
  if (youpi == ECM_FACTOR_FOUND_STEP2)
    outputf (OUTPUT_VERBOSE, "Found factor while computing coeff[] * X\n");  

  if (youpi == ECM_ERROR)
    goto clear;

  /* Copy the point corresponding to the highest coefficient of the first 
     progression to the other progressions */
  for (i = state.S + 1; i < state.size_fd; i += state.S + 1)
    {
      mpres_set (state.fd[i + state.S].x, state.fd[state.S].x, modulus);
      mpres_set (state.fd[i + state.S].y, state.fd[state.S].y, modulus);
    }

  clear_list (coeffs, state.size_fd);
  coeffs = NULL;

  if (test_verbose (OUTPUT_VERBOSE))
    {
      unsigned int st1 = cputime ();
      outputf (OUTPUT_VERBOSE,
	       "Initializing tables of differences for F took %ldms",
	       elltime (st, st1));
      outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds", muls, gcds);
      outputf (OUTPUT_VERBOSE, "\n");
      st = st1;
      muls = 0;
      gcds = 0;
    }

  /* Now for the actual calculation of the roots. */

  for (i = 0; i < dF && !youpi;)
    {
      /* Is this a rsieve value where we computed Dickson(j * d2) * X? */
      if (gcd (state.rsieve, state.dsieve) == 1) 
        {
          /* Did we use every progression since the last update? */
          if (state.next == state.nr)
            {
              /* Yes, time to update again */
              youpi = addWnm (f, state.fd, s, modulus, state.nr, state.S, 
                              state.T, &muls, &gcds);
	      ASSERT(youpi != ECM_ERROR); /* no error can occur in addWnm */
              state.next = 0;
              if (youpi == ECM_FACTOR_FOUND_STEP2)
                outputf (OUTPUT_VERBOSE,
			 "Found factor while computing roots of F\n");
            }
          
          /* Is this a j value where we want Dickson(j * d2) * X as a root? */
          if (gcd (state.rsieve, root_params->d1) == 1) 
            mpres_get_z (F[i++], state.fd[state.next * (state.S + 1)].x, 
                         modulus);

          state.next ++;
        }
      state.rsieve += 6;
    }

 clear:
  for (i = 0 ; i < state.size_fd + 4; i++)
    mpres_clear (state.T[i], modulus);
  free (state.T);

 ecm_rootsF_clearfdi:
  for (i = 0; i < state.size_fd; i++)
    {
      mpres_clear (state.fd[i].x, modulus);
      mpres_clear (state.fd[i].y, modulus);
    }
  free (state.fd);

 exit_ecm_rootsF:
  if (youpi)
    return youpi; /* error or factor found */
  
  outputf (OUTPUT_VERBOSE, "Computing roots of F took %ldms",
	   elltime (st, cputime ()));
  outputf (OUTPUT_DEVVERBOSE, ", %ld muls and %ld extgcds", muls, gcds);
  outputf (OUTPUT_VERBOSE, "\n");

  return ECM_NO_FACTOR_FOUND;
}

/* Perform the necessary initialization to allow computation of
   
     Dickson_{S, a}(s+n*d) * P , where P is a point on the elliptic curve
   
   for successive n, where Dickson_{S, a} is the degree S Dickson
   polynomial with parameter a. For a == 0, Dickson_{S, a} (x) = x^S.
   
   If a factor is found during the initialisation, NULL is returned and the
   factor in f. If an error occurred, NULL is returned and f is -1.
*/

#ifdef MONT_ROOTS
static ecm_roots_state *
ecm_rootsG_Mont_init (curve *X, root_params_t *root_params, 
		      unsigned long dF, unsigned long blocks, mpmod_t modulus)
{
  unsigned long i;
  unsigned long muls;
  ecm_roots_state *state;
  long st;
  mpz_t t;

  outputf (OUTPUT_DEVVERBOSE,
           "ecm_rootsG_Mont_init: dF = %lu, k = %lu, root_params: d1 = %lu, "
           "d2 = %lu, S = %i, i0 = %Zd\n",
           dF, blocks, root_params->d1, root_params->d2, root_params->S,
           root_params->i0);

  st = cputime ();
  
  state = (ecm_roots_state *) malloc (sizeof (ecm_roots_state));
  if (state == NULL)
    return NULL;
    
  state->size_fd = 3;
  state->nr = 1;
  state->next = 0;
  state->S = 1;
  state->dsieve = root_params->d2;
  state->rsieve = mpz_tdiv_ui (root_params->i0, root_params->d2);
  state->dickson_a = 0;
  /* we'll need to know which roots are {0, 1, 2}*d2*X in ecm_rootsG() */
  if (mpz_fits_slong_p (root_params->i0))
    state->i0 = mpz_get_si (root_params->i0);
  else
    state->i0 = 2;
#ifdef MONT_ROOTS
  state->form = EC_MONTGOMERY_FORM;
#endif
  state->fd = malloc (state->size_fd * sizeof (point));
  if (state->fd == NULL)
    return NULL;
  state->size_T = 4;
  state->T = malloc (state->size_T * sizeof (mpres_t));
  if (state->T == NULL)
    {
      free (state->fd);
      return NULL;;
    }
  state->X = X;
  
  for (i = 0; i < state->size_fd; i++)
    {
      MEMORY_TAG;
      mpres_init (state->fd[i].x, modulus);
      MEMORY_TAG;
      mpres_init (state->fd[i].y, modulus);
      MEMORY_UNTAG;
    }

  for (i = 0; i < state->size_T; i++)
    {
      MEMORY_TAG;
      mpres_init (state->T[i], modulus);
      MEMORY_UNTAG;
    }

  MPZ_INIT (t);
  
  mpres_add_ui (state->T[0], X->A, 2, modulus);
  mpres_div_2exp (state->T[0], state->T[0], 2, modulus);
  /* state->T[0] = b, ...T[1..3] = temp space for add3() */
  
  outputf (OUTPUT_TRACE, "ecm_rootsG_Mont_init: X = (%Zd::%Zd)\n",
           X->x, X->y);
  
  mpres_set (state->fd[2].x, X->x, modulus);
  mpres_set (state->fd[2].y, X->y, modulus);
  mpz_set_ui (t, root_params->d1);
  ecm_mul (state->fd[2].x, state->fd[2].y, t, modulus, state->T[0]);
  /* fd[2] = d1*X */
  outputf (OUTPUT_TRACE, "ecm_rootsG_Mont_init: d1*X = (%Zd::%Zd)\n",
           state->fd[2].x, state->fd[2].y);
  
  mpres_set (state->fd[0].x, state->fd[2].x, modulus);
  mpres_set (state->fd[0].y, state->fd[2].y, modulus);
  ecm_mul (state->fd[0].x, state->fd[0].y, root_params->i0, modulus, 
	   state->T[0]);
  /* fd[0] = i0*d1*X */
  outputf (OUTPUT_TRACE, "ecm_rootsG_Mont_init: i0*d1*X = (%Zd::%Zd)\n",
           state->fd[0].x, state->fd[0].y);

  mpres_set (state->fd[1].x, state->fd[2].x, modulus);
  mpres_set (state->fd[1].y, state->fd[2].y, modulus);
  mpz_sub_ui (t, root_params->i0, 1);
  ecm_mul (state->fd[1].x, state->fd[1].y, t, modulus, state->T[0]);
  /* fd[1] = (i0-1)*d1*X */
  outputf (OUTPUT_TRACE, "ecm_rootsG_Mont_init: (i0-1)*d1*X = (%Zd::%Zd)\n",
           state->fd[1].x, state->fd[1].y);
  
  st = elltime (st, cputime ());
  outputf (OUTPUT_VERBOSE,
	   "Initializing table of differences for G took %ldms\n", st);
  
  mpz_clear (t);

  return state;
}
#endif

ecm_roots_state *
ecm_rootsG_init (mpz_t f, curve *X, root_params_t *root_params, 
                 unsigned long dF, unsigned long blocks, mpmod_t modulus)
{
  unsigned int k, phid2;
  unsigned long muls = 0, gcds = 0;
  listz_t coeffs;
  ecm_roots_state *state;
  int youpi = 0;
  int dickson_a;
  unsigned int T_inv;
  double bestnr;
  long st = 0;

  ASSERT (gcd (root_params->d1, root_params->d2) == 1);

#ifdef MONT_ROOTS
  if (root_params->S == 1)
    return ecm_rootsG_Mont_init (X, root_params, dF, blocks, modulus);
#endif

  if (test_verbose (OUTPUT_VERBOSE))
    st = cputime ();
  
  state = (ecm_roots_state *) malloc (sizeof (ecm_roots_state));
  if (state == NULL)
    {
      mpz_set_si (f, -1);
      return NULL;
    }

  /* If S < 0, use degree |S| Dickson poly, otherwise use x^S */
  dickson_a = (root_params->S < 0) ? -1 : 0;
  state->S = abs (root_params->S);

  /* Estimate the cost of a modular inversion (in unit of time per 
     modular multiplication) */
  if (modulus->repr == ECM_MOD_BASE2)
    T_inv = 18;
  else
    T_inv = 6;
  
  /* Guesstimate a value for the number of disjoint progressions to use */
  bestnr = -(4. + T_inv) + sqrt(12. * (double) dF * (double) blocks * 
        (T_inv - 3.) * log (2. * root_params->d1) / log (2.) - (4. + T_inv) * 
        (4. + T_inv));
  bestnr /= 6. * (double) (state->S) * log (2. * root_params->d1) / log (2.0);
  
  outputf (OUTPUT_TRACE, "ecm_rootsG_init: bestnr = %f\n", bestnr);
  
  if (bestnr < 1.)
    state->nr = 1;
  else
    state->nr = (unsigned int) (bestnr + .5);

  phid2 = eulerphi (root_params->d2);

  /* Round up state->nr to multiple of eulerphi(d2) */
  if (phid2 > 1)
    state->nr = ((state->nr + (phid2 - 1)) / phid2) * phid2;

  state->size_fd = state->nr * (state->S + 1);

  outputf (OUTPUT_DEVVERBOSE, "ecm_rootsG_init: i0=%Zd, d1=%lu, d2=%lu, "
           "dF=%lu, blocks=%lu, S=%u, T_inv = %d, nr=%d\n", root_params->i0, 
           root_params->d1, root_params->d2, dF, blocks, state->S, T_inv, 
           state->nr);
  
  state->X = X;
  state->next = 0;
  state->dsieve = 1; /* We only init progressions coprime to d2, so nothing to be skipped */
  state->rsieve = 0;
#ifdef MONT_ROOTS
  state->form = EC_WEIERSTRASS_FORM;
#endif

  coeffs = init_progression_coeffs (root_params->i0, root_params->d2, 
           root_params->d1, state->nr / phid2, 1, state->S, dickson_a);

  if (coeffs == NULL) /* error */
    {
      free (state);
      mpz_set_si (f, -1);
      return NULL;
    }

  state->fd = (point *) malloc (state->size_fd * sizeof (point));
  if (state->fd == NULL)
    {
      clear_list (coeffs, state->size_fd);
      free (state);
      mpz_set_si (f, -1);
      return NULL;
    }
  for (k = 0; k < state->size_fd; k++)
    {
      MEMORY_TAG;
      mpres_init (state->fd[k].x, modulus);
      MEMORY_TAG;
      mpres_init (state->fd[k].y, modulus);
      MEMORY_UNTAG;
    }
  
  state->size_T = state->size_fd + 4;
  state->T = (mpres_t *) malloc (state->size_T * sizeof (mpres_t));
  if (state->T == NULL)
    {
      for (k = 0; k < state->size_fd; k++)
        {
          mpres_clear (state->fd[k].x, modulus);
          mpres_clear (state->fd[k].y, modulus);
        }
      clear_list (coeffs, state->size_fd);
      free (state);
      mpz_set_si (f, -1);
      return NULL;
    }
  for (k = 0; k < state->size_T; k++)
    {
      MEMORY_TAG;
      mpres_init (state->T[k], modulus);
      MEMORY_UNTAG;
    }

  for (k = state->S + 1; k < state->size_fd; k += state->S + 1)
     mpz_set_ui (coeffs[k + state->S], 1);

  if (test_verbose (OUTPUT_TRACE))
    for (k = 0; k < state->size_fd; k++)
      outputf (OUTPUT_TRACE, "ecm_rootsG_init: coeffs[%d] == %Zd\n", 
               k, coeffs[k]);

  youpi = multiplyW2n (f, state->fd, X, coeffs, state->size_fd, modulus, 
                     state->T[0], state->T[1], state->T + 2, &muls, &gcds);
  if (youpi == ECM_ERROR)
    mpz_set_si (f, -1); /* fall through */

  for (k = state->S + 1; k < state->size_fd; k += state->S + 1)
    {
      mpres_set (state->fd[k + state->S].x, state->fd[state->S].x, modulus);
      mpres_set (state->fd[k + state->S].y, state->fd[state->S].y, modulus);
    }
  
  clear_list (coeffs, state->size_fd);
  coeffs = NULL;
  
  if (youpi != ECM_NO_FACTOR_FOUND) /* factor found or error */
    {
      if (youpi == ECM_FACTOR_FOUND_STEP2)
        outputf (OUTPUT_VERBOSE, "Found factor while computing fd[]\n");

      ecm_rootsG_clear (state, modulus);
      
      /* Signal that a factor was found, or an error occurred (f=-1) */
      state = NULL;
    }
  else
    {
      if (test_verbose (OUTPUT_VERBOSE))
        {
          st = elltime (st, cputime ());
          outputf (OUTPUT_VERBOSE,
		   "Initializing table of differences for G took %ldms", st);
	  outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds",
		   muls, gcds);
          outputf (OUTPUT_VERBOSE, "\n");
        }
    }
  
  return state;
}

void 
ecm_rootsG_clear (ecm_roots_state *state, ATTRIBUTE_UNUSED mpmod_t modulus)
{
  unsigned int k;
  
  for (k = 0; k < state->size_fd; k++)
    {
      mpres_clear (state->fd[k].x, modulus);
      mpres_clear (state->fd[k].y, modulus);
    }
  free (state->fd);
  
  for (k = 0; k < state->size_T; k++)
    mpres_clear (state->T[k], modulus);
  free (state->T);
  
  free (state);
}

/* Puts in G the successive values of

     Dickson_{S, a}(s+j*k) P
    
     where P is a point on the elliptic curve,
     0<= j <= dF-1, k is the 'd' value from ecm_rootsG_init()
     and s is the 's' value of ecm_rootsG_init() or where a previous
     call to ecm_rootsG has left off.

   Returns non-zero iff a factor was found (then stored in f).
   Cannot return an error.
*/

int 
ecm_rootsG (mpz_t f, listz_t G, unsigned long dF, ecm_roots_state *state, 
            mpmod_t modulus)
{
  unsigned long i;
  unsigned long muls = 0, gcds = 0;
  int youpi = ECM_NO_FACTOR_FOUND;
  long st;
  point *fd = state->fd; /* to save typing */
  
  st = cputime ();

  outputf (OUTPUT_TRACE, "ecm_rootsG: dF = %lu, state: nr = %u, next = %u, "
           "S = %u, dsieve = %u, rsieve = %u,\n\tdickson_a = %d\n", 
           dF, state->nr, state->next, state->S, state->dsieve, 
           state->rsieve, state->dickson_a);
#ifdef MONT_ROOTS
  outputf (OUTPUT_TRACE, "form = %s, i0 = %ld\n", 
           (state->form == EC_WEIERSTRASS_FORM) ? 
           "Weierstrass" : "Montgomery", state->i0);
#endif
  
  for (i = 0; i < dF;)
    {
      /* Did we use every progression since the last update? */
      if (state->next == state->nr)
        {
          /* Yes, time to update again */
#ifdef MONT_ROOTS
	  if (state->form == EC_MONTGOMERY_FORM) /* If in Montgomery form */
	    {
              mpres_swap (fd[0].x, fd[1].x, modulus);
              mpres_swap (fd[0].y, fd[1].y, modulus);
	      if (state->i0 == -1)
	        {
	          /* New point is 0*d1*X */
	          mpres_set_ui (fd[0].x, 0, modulus);
	          mpres_set_ui (fd[0].y, 0, modulus);
	        }
	      else if (state->i0 == 0)
	        {
	          /* Test that old point is (0::0) */
	          ASSERT (mpres_is_zero (fd[1].x, modulus) && 
	                  mpres_is_zero (fd[1].y, modulus));
	          /* New point is 1*d1*X */
	          mpres_set (fd[0].x, fd[2].x, modulus);
	          mpres_set (fd[0].y, fd[2].y, modulus);
	        }
	      else if (state->i0 == 1)
	        {
	          /* New point is 2*d1*X */
#if 0 && defined (WANT_ASSERT)
                  /* Compare that fd[1] and fd[2] are identical */
                  /* FIXME: strictly speaking, we need to normalize first! */
                  mpres_sub (state->T[1], fd[1].x, fd[2].x, modulus);
                  mpres_sub (state->T[2], fd[1].y, fd[2].y, modulus);
                  ASSERT (mpres_is_zero (state->T[1], modulus) && 
                          mpres_is_zero (state->T[2], modulus));
#endif
                  ASSERT (mpres_is_zero (fd[0].x, modulus));
                  ASSERT (mpres_is_zero (fd[0].y, modulus));
	          duplicate (fd[0].x, fd[0].y, fd[1].x, fd[1].y, modulus,
	                     state->T[0], state->T[1], state->T[2], 
	                     state->T[3]);
	        }
	      else 
	        {
	          add3 (fd[0].x, fd[0].y, fd[1].x, fd[1].y, fd[2].x, fd[2].y, 
		        fd[0].x, fd[0].y, modulus, 
		        state->T[1], state->T[2], state->T[3] );
	        }
	      if (state->i0 < 2)
	        state->i0++;
	    }
	  else /* in Weierstrass form */
#endif
	    {
	      youpi = addWnm (f, fd, state->X, modulus, state->nr, 
			      state->S, state->T, &muls, &gcds);
	      ASSERT(youpi != ECM_ERROR); /* no error can occur in addWnm */
	    }
	  state->next = 0;
          
          if (youpi == ECM_FACTOR_FOUND_STEP2)
            {
	      outputf (OUTPUT_VERBOSE, "Found factor while computing G[]\n");
              break;
            }
        }
      
      /* Is this a root we should skip? (Take only if gcd == 1) */
      if (gcd (state->rsieve, state->dsieve) == 1)
	{
#ifdef MONT_ROOTS
	  if (state->form == EC_MONTGOMERY_FORM)
	    {
	      if (state->i0 != 0)
	        {
                  /* Normalize coordinates to z=1 */
                  if (!mpres_invert (state->T[1], fd[0].y, modulus))
                    {
                      mpres_gcd (f, fd[0].y, modulus);
	              outputf (OUTPUT_VERBOSE, 
	                       "Found factor while computing G[%lu]\n", i);
                      return ECM_FACTOR_FOUND_STEP2;
                    }
                  mpres_mul (state->T[1], state->T[1], 
                             (fd + state->next * (state->S + 1))->x,
                             modulus);
                }
              mpres_get_z (G[i++], state->T[1], modulus);
            } 
          else 
#endif
            {
              mpres_get_z (G[i++], 
                           (fd + state->next * (state->S + 1))->x, 
                           modulus);
            }
	  outputf (OUTPUT_TRACE, 
	           "ecm_rootsG: storing d1*%u*X = %Zd in G[%lu]\n",
		   state->rsieve, G[i - 1], i);
	}

      state->next ++;
      state->rsieve ++;
    }
  
  outputf (OUTPUT_VERBOSE, "Computing roots of G took %ldms",
	   elltime (st, cputime ()));
  outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds", muls, gcds);
  outputf (OUTPUT_VERBOSE, "\n");
  
  return youpi;
}


/* Find smallest i >= 0 so that 
   f(j * d2) X = f((i0 + i) * d1) X over GF(p) */
long 
ecm_findmatch (const unsigned long j, root_params_t *root_params, 
               curve *X, mpmod_t n, mpz_t p)
{
  const int dickson_a = root_params->S < 0 ? -1 : 0;
  const unsigned int S = abs (root_params->S);
  const int sizeT = S + 3;
  long i = 0, k;
  point iX, jX;
  curve Xp; /* The point and curve over GF(p) */
  mpmod_t modulus;
  mpz_t s, t; /* temp vars */
  mpres_t u, v; /* temp vars */
  listz_t coeffs;
  point *fd;
  mpres_t *T;
  
  outputf (OUTPUT_RESVERBOSE, "Looking for i so that "
           "f(i*%lu)*X = f(%lu*%lu)*X\n", root_params->d1, 
           j, root_params->d2);
  
  mpmod_init (modulus, p, 0);
  mpz_init (s);
  mpz_init (t);
  mpres_init (u, modulus);
  mpres_init (v, modulus);
  mpres_init (Xp.x, modulus);
  mpres_init (Xp.y, modulus);
  mpres_init (Xp.A, modulus);
  mpres_init (iX.x, modulus);
  mpres_init (iX.y, modulus);
  mpres_init (jX.x, modulus);
  mpres_init (jX.y, modulus);
  T = malloc (sizeT * sizeof (mpres_t));
  if (T == NULL)
    goto clear_and_exit;
  for (k = 0; k < sizeT; k++)
    mpres_init (T[k], modulus);
  fd = malloc ((S + 1) * sizeof (point));
  if (fd == NULL)
    goto clear_T_and_exit;
  for (k = 0; k < S + 1; k++)
    {
      mpres_init (fd[k].x, modulus);
      mpres_init (fd[k].y, modulus);
    }

  /* Copy the parameters of the curve over Z/ZN to the curve over GF(p) */
  mpres_get_z (t, X->x, n);
  mpres_set_z (Xp.x, t, modulus);
  mpres_get_z (t, X->y, n);
  mpres_set_z (Xp.y, t, modulus);
  mpres_get_z (t, X->A, n);
  mpres_set_z (Xp.A, t, modulus);
  
  /* We use init_progression_coeffs() to compute f(j * d2) */
  mpz_set_ui (t, j);
  coeffs = init_progression_coeffs (t, 1UL, root_params->d2, 1U, 1U, S, 
                                    dickson_a);
  if (coeffs == NULL)
    goto clear_fd_and_exit;
  
  /* Now compute f(j * d2) X */
  multiplyW2n (NULL, &jX, &Xp, coeffs, 1U, modulus, u, v, T, NULL, NULL);

  clear_list (coeffs, S + 1);

  /* We'll keep {f(j * d2) X}_x in s */
  mpres_get_z (s, jX.x, modulus);
  outputf (OUTPUT_DEVVERBOSE, "ecm_findmatch: (f(j * d2) X)_x = %Zd\n", s);

  /* Now compute {f((i0 + i) d1) X}_x one at a time and put them in t, 
     until s == t */
  
  /* Init the progression */
  coeffs = init_progression_coeffs (root_params->i0, 1UL, root_params->d1, 
                                    1U, 1U, S, dickson_a);
  if (coeffs == NULL)
    goto clear_fd_and_exit;
  multiplyW2n (NULL, fd, &Xp, coeffs, S + 1, modulus, u, v, T, NULL, NULL);
  clear_list (coeffs, S + 1);
  
  i = 0;
  mpres_get_z (t, fd[0].x, modulus);
  while (mpz_cmp (s, t) != 0)
    {
      addWnm (NULL, fd, &Xp, modulus, 1, S, T, NULL, NULL);
      mpres_get_z (t, fd[0].x, modulus);
      i++;
    }

  outputf (OUTPUT_DEVVERBOSE, "ecm_findmatch: i - i0 = %ld, "
           "{f(i * d1) X}_x = %Zd\n", i, t);

  /* We'll compute f(i * d1) and compare it to f(j * d2) to verify 
     correctness of the result, and to determine whether it was
     f(i * d1)-f(j * d2) or f(i * d1)+f(j * d2) that found the factor */
  /* We use init_progression_coeffs() to compute f(i * d1) */
  mpz_add_ui (t, root_params->i0, i);
  coeffs = init_progression_coeffs (t, 1UL, root_params->d1, 1U, 1U, S, 
                                    dickson_a);
  if (coeffs == NULL)
    goto clear_fd_and_exit;
  
  /* Now compute iX = f(i * d1) X */
  multiplyW2n (NULL, &iX, &Xp, coeffs, 1U, modulus, u, v, T, NULL, NULL);
  
  clear_list (coeffs, S + 1);

  mpres_get_z (t, iX.x, modulus);
  if (mpz_cmp (s, t) != 0)
    {
      outputf (OUTPUT_ERROR, "ecm_findmatch: ERROR, (f(i*d1) X)_x != "
               "(f(j*d2) X)_x\n(f(i*d1) X)_x = %Zd\n", t);
      i = 0;
    }

  mpres_get_z (s, jX.y, modulus);
  mpres_get_z (t, iX.y, modulus);
  if (mpz_cmp (s, t) != 0)
    {
      mpz_sub (t, p, t);
      if (mpz_cmp (s, t) != 0)
        {
          mpz_sub (t, p, t);
          outputf (OUTPUT_ERROR, "ecm_findmatch: ERROR, (f(i*d1) X)_y != "
                   "+-(f(j*d2) X)_y\n");
          outputf (OUTPUT_ERROR, "(f(i*d1) X)_y = %Zd\n", t);
          outputf (OUTPUT_ERROR, "(f(j*d2) X)_y = %Zd\n", s);
        }
      else
        i = -i;
    }

clear_fd_and_exit:
  for (k = 0; k < S + 1; k++)
    {
      mpres_clear (fd[k].x, modulus);
      mpres_clear (fd[k].y, modulus);
    }
  free(fd);
clear_T_and_exit:
  for (k = 0; k < sizeT; k++)
    mpres_clear (T[k], modulus);
  free (T);
clear_and_exit:
  mpz_clear (s);
  mpz_clear (t);
  mpres_clear (u, modulus);
  mpres_clear (v, modulus);
  mpres_clear (Xp.x, modulus);
  mpres_clear (Xp.y, modulus);
  mpres_clear (Xp.A, modulus);
  mpres_clear (iX.x, modulus);
  mpres_clear (iX.y, modulus);
  mpres_clear (jX.x, modulus);
  mpres_clear (jX.y, modulus);
  mpmod_clear (modulus);
  
  return i;
}