/* Polynomial multiplication using GMP's integer multiplication code
Copyright 2004, 2005 Dave Newman, Paul Zimmermann.
This file is part of the ECM Library.
The ECM Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The ECM Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the ECM Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA.
*/
#include <stdlib.h>
#include "ecm-gmp.h" /* for MPZ_REALLOC and MPN_COPY */
#include "ecm-impl.h"
#ifdef HAVE___GMPN_MUL_FFT
#define FFT_WRAP
#endif
/* Puts in R[0..2l-2] the product of A[0..l-1] and B[0..l-1].
T must have as much space as for toomcook4 (it is only used when that
function is called).
Notes:
- this code aligns the coeffs at limb boundaries - if instead we aligned
at byte boundaries then we could save up to 3*l bytes in T0 and T1,
but tests have shown this doesn't give any significant speed increase,
even for large degree polynomials.
- this code requires that all coefficients A[] and B[] are nonnegative.
Return non-zero if an error occurred.
*/
int
kronecker_schonhage (listz_t R, listz_t A, listz_t B, unsigned int l,
listz_t T)
{
unsigned long i;
mp_size_t s, t = 0, size_t0, size_tmp;
mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr;
s = mpz_sizeinbase (A[0], 2);
if ((double) l * (double) s < KS_MUL_THRESHOLD)
{
toomcook4 (R, A, B, l, T);
return 0;
}
for (i = 0; i < l; i++)
{
if ((s = mpz_sizeinbase (A[i], 2)) > t)
t = s;
if ((s = mpz_sizeinbase (B[i], 2)) > t)
t = s;
}
/* max number of bits in a coeff of T[0] * T[1] will be
2 * t + ceil(log_2(l)) */
s = t * 2;
for (i = l - 1; i; s++, i >>= 1); /* ceil(log_2(l)) = 1+floor(log_2(l-1)) */
/* work out the corresponding number of limbs */
s = 1 + (s - 1) / GMP_NUMB_BITS;
/* Note: s * (l - 1) + ceil(t/GMP_NUMB_BITS) should be faster,
but no significant speedup was observed */
size_t0 = s * l;
/* allocate one double-buffer to save malloc/MPN_ZERO/free calls */
t0_ptr = (mp_ptr) malloc (2 * size_t0 * sizeof (mp_limb_t));
if (t0_ptr == NULL)
return 1;
t1_ptr = t0_ptr + size_t0;
MPN_ZERO (t0_ptr, size_t0 + size_t0);
for (i = 0; i < l; i++)
{
if (SIZ(A[i]))
MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i]));
if (SIZ(B[i]))
MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i]));
}
t2_ptr = (mp_ptr) malloc (2 * size_t0 * sizeof (mp_limb_t));
if (t2_ptr == NULL)
{
free (t0_ptr);
return 1;
}
/* mpn_mul_fft (a, b, c, n) allocates auxiliary memory of about 8n limbs,
thus the total memory allocated by this function is about 12*size_t0.
Since size_t0 is about 2*dF*limbs(modulus), this is about
24*dF*limbs(modulus). */
mpn_mul_n (t2_ptr, t0_ptr, t1_ptr, size_t0);
for (i = 0; i < 2 * l - 1; i++)
{
size_tmp = s;
MPN_NORMALIZE(t2_ptr + i * s, size_tmp);
r_ptr = MPZ_REALLOC (R[i], size_tmp);
if (size_tmp)
MPN_COPY (r_ptr, t2_ptr + i * s, size_tmp);
SIZ(R[i]) = size_tmp;
}
free (t0_ptr);
free (t2_ptr);
return 0;
}
/* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients
of degree m to n+m of rev(a)*c, i.e.
b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l)
...
b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k)
...
b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n].
If rev=0, consider a instead of rev(a).
Assumes n <= l.
Return non-zero if an error occurred.
*/
int
TMulKS (listz_t b, unsigned int n, listz_t a, unsigned int m,
listz_t c, unsigned int l, mpz_t modulus, int rev)
{
unsigned long i, s = 0, t, k;
mp_ptr ap, bp, cp;
mp_size_t an, bn, cn;
int ret = 0; /* default return value */
#ifdef DEBUG
long st = cputime ();
fprintf (ECM_STDOUT, "n=%u m=%u l=%u bits=%u n*bits=%u: ", n, m, l,
mpz_sizeinbase (modulus, 2), n * mpz_sizeinbase (modulus, 2));
#endif
ASSERT (n <= l); /* otherwise the upper coefficients of b are 0 */
if (l > n + m)
l = n + m; /* otherwise, c has too many coeffs */
/* compute max bits of a[] and c[] */
for (i = 0; i <= m; i++)
{
if (mpz_sgn (a[i]) < 0)
mpz_mod (a[i], a[i], modulus);
if ((t = mpz_sizeinbase (a[i], 2)) > s)
s = t;
}
for (i = 0; i <= l; i++)
{
if (mpz_sgn (c[i]) < 0)
mpz_mod (c[i], c[i], modulus);
if ((t = mpz_sizeinbase (c[i], 2)) > s)
s = t;
}
#ifdef FFT_WRAP
s ++; /* need one extra bit to determine sign of low(b) - high(b) */
#endif
/* max coeff has 2*s+ceil(log2(max(m+1,l+1))) bits,
i.e. 2*s + 1 + floor(log2(max(m,l))) */
for (s = 2 * s, i = (m > l) ? m : l; i; s++, i >>= 1);
/* corresponding number of limbs */
s = 1 + (s - 1) / GMP_NUMB_BITS;
an = (m + 1) * s;
cn = (l + 1) * s;
bn = an + cn;
/* a[0..m] needs (m+1) * s limbs */
ap = (mp_ptr) malloc (an * sizeof (mp_limb_t));
if (ap == NULL)
{
ret = 1;
goto TMulKS_end;
}
cp = (mp_ptr) malloc (cn * sizeof (mp_limb_t));
if (cp == NULL)
{
ret = 1;
goto TMulKS_free_ap;
}
MPN_ZERO (ap, an);
MPN_ZERO (cp, cn);
/* a is reverted */
for (i = 0; i <= m; i++)
if (SIZ(a[i]))
MPN_COPY (ap + ((rev) ? (m - i) : i) * s, PTR(a[i]), SIZ(a[i]));
for (i = 0; i <= l; i++)
if (SIZ(c[i]))
MPN_COPY (cp + i * s, PTR(c[i]), SIZ(c[i]));
#ifdef FFT_WRAP
/* the product rev(a) * c has m+l+1 coefficients.
We throw away the first m and the last l-n <= m.
If we compute mod (m+n+1) * s limbs, we are ok */
k = mpn_fft_best_k ((m + n + 1) * s, 0);
bn = mpn_fft_next_size ((m + n + 1) * s, k);
bp = (mp_ptr) malloc ((bn + 1) * sizeof (mp_limb_t));
if (bp == NULL)
{
ret = 1;
goto TMulKS_free_cp;
}
mpn_mul_fft (bp, bn, ap, an, cp, cn, k);
if (m && bp[m * s - 1] >> (GMP_NUMB_BITS - 1)) /* lo(b)-hi(b) is negative */
mpn_add_1 (bp + m * s, bp + m * s, (n + 1) * s, (mp_limb_t) 1);
#else
bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t));
if (bp == NULL)
{
ret = 1;
goto TMulKS_free_cp;
}
if (an >= cn)
mpn_mul (bp, ap, an, cp, cn);
else
mpn_mul (bp, cp, cn, ap, an);
#endif
/* recover coefficients of degree m to n+m of product in b[0..n] */
bp += m * s;
for (i = 0; i <= n; i++)
{
t = s;
MPN_NORMALIZE(bp, t);
MPZ_REALLOC (b[i], (mp_size_t) t);
if (t)
MPN_COPY (PTR(b[i]), bp, t);
SIZ(b[i]) = t;
bp += s;
}
bp -= (m + n + 1) * s;
free (bp);
TMulKS_free_cp:
free (cp);
TMulKS_free_ap:
free (ap);
#ifdef DEBUG
fprintf (ECM_STDOUT, "%ums\n", elltime (st, cputime ()));
#endif
TMulKS_end:
return ret;
}
#ifdef DEBUG
void
mpn_print (mp_ptr np, mp_size_t nn)
{
mp_size_t i;
for (i = 0; i < nn; i++)
fprintf (ECM_STDOUT, "+%lu*B^%u", np[i], i);
fprintf (ECM_STDOUT, "\n");
}
#endif
unsigned int
ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n)
{
mp_size_t t, s;
unsigned long i, fft_k, m;
t = mpz_sizeinbase (n, 2);
s = t * 2 + 1;
for (i = k - 1; i; s++, i >>= 1);
s = 1 + (s - 1) / GMP_NUMB_BITS;
fft_k = mpn_fft_best_k (m0 * s, 0);
i = mpn_fft_next_size (m0 * s, fft_k);
while (i % s)
i = mpn_fft_next_size (i + 1, fft_k);
m = i / s;
return m;
}
/* multiply in R[] A[0]+A[1]*x+...+A[k-1]*x^(k-1)
by B[0]+B[1]*x+...+B[l-1]*x^(l-1) modulo n,
wrapping around coefficients of the product up from degree m >= m0.
Assumes k >= l.
R is assumed to have 2*m0-3+list_mul_mem(m0-1) allocated cells.
Return m (or 0 if an error occurred).
*/
unsigned int
ks_wrapmul (listz_t R, unsigned int m0,
listz_t A, unsigned int k,
listz_t B, unsigned int l,
mpz_t n)
{
unsigned long i, fft_k, m, t;
mp_size_t s, size_t0, size_t1, size_tmp;
mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr, tp;
int negative;
ASSERT(k >= l);
t = mpz_sizeinbase (n, 2);
for (i = 0; i < k; i++)
if (mpz_sgn (A[i]) < 0 || mpz_sizeinbase (A[i], 2) > t)
mpz_mod (A[i], A[i], n);
for (i = 0; i < l; i++)
if (mpz_sgn (B[i]) < 0 || mpz_sizeinbase (B[i], 2) > t)
mpz_mod (B[i], B[i], n);
s = t * 2 + 1; /* one extra sign bit */
for (i = k - 1; i; s++, i >>= 1);
s = 1 + (s - 1) / GMP_NUMB_BITS;
size_t0 = s * k;
size_t1 = s * l;
/* allocate one double-buffer to save malloc/MPN_ZERO/free calls */
t0_ptr = (mp_ptr) malloc (size_t0 * sizeof (mp_limb_t));
if (t0_ptr == NULL)
return 0;
t1_ptr = (mp_ptr) malloc (size_t1 * sizeof (mp_limb_t));
if (t1_ptr == NULL)
{
free (t0_ptr);
return 0;
}
MPN_ZERO (t0_ptr, size_t0);
MPN_ZERO (t1_ptr, size_t1);
for (i = 0; i < k; i++)
if (SIZ(A[i]))
MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i]));
for (i = 0; i < l; i++)
if (SIZ(B[i]))
MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i]));
fft_k = mpn_fft_best_k (m0 * s, 0);
i = mpn_fft_next_size (m0 * s, fft_k);
/* the following loop ensures we don't cut in the middle of a
coefficient */
while (i % s)
i = mpn_fft_next_size (i + 1, fft_k);
m = i / s;
ASSERT(m <= 2 * m0 - 3 + list_mul_mem (m0 - 1));
t2_ptr = (mp_ptr) malloc ((i + 1) * sizeof (mp_limb_t));
if (t2_ptr == NULL)
{
free (t0_ptr);
free (t1_ptr);
return 0;
}
mpn_mul_fft (t2_ptr, i, t0_ptr, size_t0, t1_ptr, size_t1, fft_k);
for (i = 0, tp = t2_ptr, negative = 0; i < m; i++)
{
size_tmp = s;
if (negative) /* previous was negative, add 1 */
mpn_add_1 (tp, tp, s, (mp_limb_t) 1);
/* no need to check return value of mpn_add_1: if 1, then {tp, s}
is now identically 0, and should remain so */
MPN_NORMALIZE(tp, size_tmp);
if ((size_tmp == s) && (tp[s - 1] >> (mp_bits_per_limb - 1)))
{
negative = 1;
mpn_com_n (tp, tp, s);
mpn_add_1 (tp, tp, s, (mp_limb_t) 1);
}
else
negative = 0;
r_ptr = MPZ_REALLOC (R[i], size_tmp);
if (size_tmp)
MPN_COPY (r_ptr, tp, size_tmp);
SIZ(R[i]) = (negative) ? -size_tmp : size_tmp;
tp += s;
}
free (t0_ptr);
free (t1_ptr);
free (t2_ptr);
return m;
}
syntax highlighted by Code2HTML, v. 0.9.1