/* $Id: arith2.c,v 1.135 2006/04/08 10:48:32 kb Exp $

Copyright (C) 2000  The PARI group.

This file is part of the PARI/GP package.

PARI/GP is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation. It is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY WHATSOEVER.

Check the License for details. You should have received a copy of it, along
with the package; see the file 'COPYING'. If not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */

/*********************************************************************/
/**                                                                 **/
/**                     ARITHMETIC FUNCTIONS                        **/
/**                        (second part)                            **/
/**                                                                 **/
/*********************************************************************/
#include "pari.h"
#include "paripriv.h"
/***********************************************************************/
/**                                                                   **/
/**                          PRIME NUMBERS                            **/
/**                                                                   **/
/***********************************************************************/

/* assume all primes up to 59359 are precomputed */
GEN
prime(long n)
{
  byteptr p;
  ulong prime;

  if (n <= 0) pari_err(talker, "n-th prime meaningless if n = %ld",n);
  if (n < 1000) {
    p = diffptr;
    prime = 0;
  } else if (n < 2000) {
    n -= 1000; p = diffptr+1000;
    prime = 7919;
  } else if (n < 3000) {
    n -= 2000; p = diffptr+2000;
    prime = 17389;
  } else if (n < 4000) {
    n -= 3000; p = diffptr+3000;
    prime = 27449;
  } else if (n < 5000) {
    n -= 4000; p = diffptr+4000;
    prime = 37813;
  } else if (n < 6000) {
    n -= 5000; p = diffptr+5000;
    prime = 48611;
  } else if (n < 10000 || maxprime() < 500000) {
    n -= 6000; p = diffptr+6000;
    prime = 59359;
  } else if (n < 20000) {
    n -= 10000; p = diffptr+10000;
    prime = 104729;
  } else if (n < 30000) {
    n -= 20000; p = diffptr+20000;
    prime = 224737;
  } else if (n < 40000) {
    n -= 30000; p = diffptr+30000;
    prime = 350377;
  } else {
    n -= 40000; p = diffptr+40000;
    prime = 479909;
  }
  while (n--) NEXT_PRIME_VIADIFF_CHECK(prime,p);
  return utoipos(prime);
}

GEN
primepi(GEN x)
{
  pari_sp av = avma;
  byteptr p = diffptr;
  ulong prime = 0, res = 0, n;
  GEN N = typ(x) == t_INT? x: gfloor(x);

  if (typ(N) != t_INT || signe(N) <= 0) pari_err(typeer, "primepi");
  avma = av; n = itou(N); maxprime_check(n);
  while (prime <= n) { res++; NEXT_PRIME_VIADIFF(prime,p); }
  return utoi(res-1);
}

GEN
primes(long n)
{
  byteptr p = diffptr;
  ulong prime = 0;
  GEN y,z;

  if (n < 0) n = 0;
  z = y = cgetg(n+1,t_VEC);
  while (n--)
  {
    NEXT_PRIME_VIADIFF_CHECK(prime,p);
    gel(++z, 0) = utoi(prime);
  }
  return y;
}

/* Building/Rebuilding the diffptr table. The actual work is done by the
 * following two subroutines;  the user entry point is the function
 * initprimes() below.  initprimes1() is the old algorithm, called when
 * maxnum (size) is moderate. */
static byteptr
initprimes1(ulong size, long *lenp, long *lastp)
{
  long k;
  byteptr q, r, s, fin, p = (byteptr) gpmalloc(size+2);

  memset(p, 0, size + 2); fin = p + size;
  for (r=q=p,k=1; r<=fin; )
  {
    do { r+=k; k+=2; r+=k; } while (*++q);
    for (s=r; s<=fin; s+=k) *s = 1;
  }
  r = p; *r++ = 2; *r++ = 1; /* 2 and 3 */
  for (s=q=r-1; ; s=q)
  {
    do q++; while (*q);
    if (q > fin) break;
    *r++ = (unsigned char) ((q-s) << 1);
  }
  *r++ = 0;
  *lenp = r - p;
  *lastp = ((s - p) << 1) + 1;
  return (byteptr) gprealloc(p,r-p);
}

/*  Timing in ms (Athlon/850; reports 512K of secondary cache; looks
    like there is 64K of quickier cache too).

      arena|    30m     100m    300m    1000m    2000m  <-- primelimit
      =================================================
      16K       1.1053  1.1407  1.2589  1.4368   1.6086 
      24K       1.0000  1.0625  1.1320  1.2443   1.3095 
      32K       1.0000  1.0469  1.0761  1.1336   1.1776 
      48K       1.0000  1.0000  1.0254  1.0445   1.0546 
      50K       1.0000  1.0000  1.0152  1.0345   1.0464 
      52K       1.0000  1.0000  1.0203  1.0273   1.0362 
      54K       1.0000  1.0000  1.0812  1.0216   1.0281 
      56K       1.0526  1.0000  1.0051  1.0144   1.0205 
      58K       1.0000  1.0000  1.0000  1.0086   1.0123 
      60K       0.9473  0.9844  1.0051  1.0014   1.0055 
      62K       1.0000  0.9844  0.9949  0.9971   0.9993 
      64K       1.0000  1.0000  1.0000  1.0000   1.0000 
      66K       1.2632  1.2187  1.2183  1.2055   1.1953 
      68K       1.4211  1.4844  1.4721  1.4425   1.4188 
      70K       1.7368  1.7188  1.7107  1.6767   1.6421 
      72K       1.9474  1.9531  1.9594  1.9023   1.8573 
      74K       2.2105  2.1875  2.1827  2.1207   2.0650 
      76K       2.4211  2.4219  2.4010  2.3305   2.2644 
      78K       2.5789  2.6250  2.6091  2.5330   2.4571 
      80K       2.8421  2.8125  2.8223  2.7213   2.6380 
      84K       3.1053  3.1875  3.1776  3.0819   2.9802 
      88K       3.5263  3.5312  3.5228  3.4124   3.2992 
      92K       3.7895  3.8438  3.8375  3.7213   3.5971 
      96K       4.0000  4.1093  4.1218  3.9986   3.9659 
      112K      4.3684  4.5781  4.5787  4.4583   4.6115 
      128K      4.7368  4.8750  4.9188  4.8075   4.8997 
      192K      5.5263  5.7188  5.8020  5.6911   5.7064 
      256K      6.0000  6.2187  6.3045  6.1954   6.1033 
      384K      6.7368  6.9531  7.0405  6.9181   6.7912 
      512K      7.3158  7.5156  7.6294  7.5000   7.4654 
      768K      9.1579  9.4531  9.6395  9.5014   9.1075 
      1024K    10.368  10.7497 10.9999 10.878   10.8201
      1536K    12.579  13.3124 13.7660 13.747   13.4739
      2048K    13.737  14.4839 15.0509 15.151   15.1282
      3076K    14.789  15.5780 16.2993 16.513   16.3365

    Now the same number relative to the model

    (1 + 0.36*sqrt(primelimit)/arena) * (arena <= 64 ? 1.05 : (arena-64)**0.38)

     [SLOW2_IN_ROOTS = 0.36, ALPHA = 0.38]

      arena|    30m     100m    300m    1000m    2000m  <-- primelimit
      =================================================
        16K    1.014    0.9835  0.9942  0.9889  1.004
        24K    0.9526   0.9758  0.9861  0.9942  0.981
        32K    0.971    0.9939  0.9884  0.9849  0.9806
        48K    0.9902   0.9825  0.996   0.9945  0.9885
        50K    0.9917   0.9853  0.9906  0.9926  0.9907
        52K    0.9932   0.9878  0.9999  0.9928  0.9903
        54K    0.9945   0.9902  1.064   0.9939  0.9913
        56K    1.048    0.9924  0.9925  0.993   0.9921
        58K    0.9969   0.9945  0.9909  0.9932  0.9918
        60K    0.9455   0.9809  0.9992  0.9915  0.9923
        62K    0.9991   0.9827  0.9921  0.9924  0.9929
        64K    1        1       1       1       1
        66K    1.02     0.9849  0.9857  0.9772  0.9704
        68K    0.8827   0.9232  0.9176  0.9025  0.8903
        70K    0.9255   0.9177  0.9162  0.9029  0.8881
        72K    0.9309   0.936   0.9429  0.9219  0.9052
        74K    0.9715   0.9644  0.967   0.9477  0.9292
        76K    0.9935   0.9975  0.9946  0.9751  0.9552
        78K    0.9987   1.021   1.021   1.003   0.9819
        80K    1.047    1.041   1.052   1.027   1.006
        84K    1.052    1.086   1.092   1.075   1.053
        88K    1.116    1.125   1.133   1.117   1.096
        92K    1.132    1.156   1.167   1.155   1.134
        96K    1.137    1.177   1.195   1.185   1.196
       112K    1.067    1.13    1.148   1.15    1.217
       128K    1.04     1.083   1.113   1.124   1.178
       192K    0.9368   0.985   1.025   1.051   1.095
       256K    0.8741   0.9224  0.9619  0.995   1.024
       384K    0.8103   0.8533  0.8917  0.9282  0.9568
       512K    0.7753   0.8135  0.8537  0.892   0.935
       768K    0.8184   0.8638  0.9121  0.9586  0.9705
      1024K    0.8241   0.8741  0.927   0.979   1.03
      1536K    0.8505   0.9212  0.9882  1.056   1.096
      2048K    0.8294   0.8954  0.9655  1.041   1.102

*/

#ifndef SLOW2_IN_ROOTS
  /* SLOW2_IN_ROOTS below 3: some slowdown starts to be noticable
   * when things fit into the cache on Sparc.
   * XXX The choice of 2.6 gives a slowdown of 1-2% on UltraSparcII,
   * but makes calculations for "maximum" of 436273009 (not applicable anymore)
   * fit into 256K cache (still common for some architectures).
   *
   * One may change it when small caches become uncommon, but the gain
   * is not going to be very noticable... */
#  ifdef i386           /* gcc defines this? */
#    define SLOW2_IN_ROOTS      0.36
#  else
#    define SLOW2_IN_ROOTS      2.6
#  endif
#endif
#ifndef CACHE_ARENA
#  ifdef i386           /* gcc defines this? */
   /* Due to smaller SLOW2_IN_ROOTS, smaller arena is OK; fit L1 cache */
#    define CACHE_ARENA (63 * 1024UL) /* No slowdown even with 64K L1 cache */
#  else
#    define CACHE_ARENA (200 * 1024UL) /* No slowdown even with 256K L2 cache */
#  endif
#endif

#define CACHE_ALPHA     (0.38)          /* Cache performance model parameter */
#define CACHE_CUTOFF    (0.018)         /* Cache performance not smooth here */

static double slow2_in_roots = SLOW2_IN_ROOTS;

typedef struct {
    ulong arena;
    double power;
    double cutoff;
} cache_model_t;

static cache_model_t cache_model = { CACHE_ARENA, CACHE_ALPHA, CACHE_CUTOFF };

/* Assume that some calculation requires a chunk of memory to be
   accessed often in more or less random fashion (as in sieving).
   Assume that the calculation can be done in steps by subdividing the
   chunk into smaller subchunks (arenas) and treathing them
   separately.  Assume that the overhead of subdivision is equivalent
   to the number of arenas.

   Find an optimal size of the arena taking into account the overhead
   of subdivision, and the overhead of arena not fitting into the
   cache.  Assume that arenas of size slow2_in_roots slows down the
   calculation 2x (comparing to very big arenas; when cache hits do
   not matter).  Since cache performance varies wildly with
   architecture, load, and wheather (especially with cache coloring
   enabled), use an idealized cache model based on benchmarks above.

   Assume that an independent region of FIXED_TO_CACHE bytes is accessed
   very often concurrently with the arena access.
 */
static ulong
good_arena_size(ulong slow2_size, ulong total, ulong fixed_to_cache,
                cache_model_t *cache_model, long model_type)
{
  ulong asize, cache_arena = cache_model->arena;
  double Xmin, Xmax, A, B, C1, C2, D, V;
  double alpha = cache_model->power, cut_off = cache_model->cutoff;

  /* Estimated relative slowdown,
     with overhead = max((fixed_to_cache+arena)/cache_arena - 1, 0):

     1 + slow2_size/arena due to initialization overhead;

     max(1, 4.63 * overhead^0.38 ) due to footprint > cache size.

     [The latter is hard to substantiate theoretically, but this
     function describes benchmarks pretty close; it does not hurt that
     one can minimize it explicitly too ;-).  The switch between
     diffenent choices of max() happens whe overhead=0.018.]

     Thus the problem is minimizing (1 + slow2_size/arena)*overhead**0.29.
     This boils down to F=((X+A)/(X+B))X^alpha, X=overhead, 
     B = (1 - fixed_to_cache/cache_arena), A = B +
     slow2_size/cache_arena, alpha = 0.38, and X>=0.018, X>-B.  It
     turns out that the minimization problem is not very nasty.  (As
     usual with minimization problems which depend on parameters,
     different values of A,B lead to different algorithms.  However,
     the boundaries between the domains in (A,B) plane where different
     algorithms work are explicitly calculatable.)

     Thus we need to find the rightmost root of (X+A)*(X+B) -
     alpha(A-B)X to the right of 0.018 (if such exists and is below
     Xmax).  Then we manually check the remaining region [0, 0.018].

     Since we cannot trust the purely-experimental cache-hit slowdown
     function, as a sanity check always prefer fitting into the
     cache (or "almost fitting") if F-law predicts that the larger
     value of the arena provides less than 10% speedup.

   */

  if (model_type != 0)
      pari_err(talker, "unsupported type of cache model"); /* Future expansion */

  /* The simplest case: we fit into cache */
  if (total + fixed_to_cache <= cache_arena)
      return total;
  /* The simple case: fitting into cache doesn't slow us down more than 10% */
  if (cache_arena - fixed_to_cache > 10 * slow2_size) {
      asize = cache_arena - fixed_to_cache;
      if (asize > total) asize = total; /* Automatically false... */
      return asize;
  }
  /* Slowdown of not fitting into cache is significant.  Try to optimize.
     Do not be afraid to spend some time on optimization - in trivial
     cases we do not reach this point; any gain we get should
     compensate the time spent on optimization.  */

  B = (1 - ((double)fixed_to_cache)/cache_arena);
  A = B + ((double)slow2_size)/cache_arena;
  C2 = A*B;
  C1 = (A + B - 1/alpha*(A - B))/2;
  D = C1*C1 - C2;
  if (D > 0)
      V = cut_off*cut_off + 2*C1*cut_off + C2; /* Value at CUT_OFF */
  else
      V = 0;                            /* Peacify the warning */
  Xmin = cut_off;
  Xmax = ((double)total - fixed_to_cache)/cache_arena; /* Two candidates */

  if ( D <= 0 || (V >= 0 && C1 + cut_off >= 0) ) /* slowdown increasing */
      Xmax = cut_off;                   /* Only one candidate */
  else if (V >= 0 &&                    /* slowdown concave down */
           ((Xmax + C1) <= 0 || (Xmax*Xmax + 2*C1*Xmax + C2) <= 0))
      /* DO NOTHING */;                 /* Keep both candidates */
  else if (V <= 0 && (Xmax*Xmax + 2*C1*Xmax + C2) <= 0) /* slowdown decreasing */
      Xmin = cut_off;                   /* Only one candidate */
  else /* Now we know: 2 roots, the largest is in CUT_OFF..Xmax */
      Xmax = sqrt(D) - C1;
  if (Xmax != Xmin) {   /* Xmin == CUT_OFF; Check which one is better */
      double v1 = (cut_off + A)/(cut_off + B);
      double v2 = 2.33 * (Xmax + A)/(Xmax + B) * pow(Xmax, alpha);

      if (1.1 * v2 >= v1) /* Prefer fitting into the cache if slowdown < 10% */
          V = v1;
      else {
          Xmin = Xmax;
          V = v2;
      }
  } else if (B > 0)                     /* We need V */
      V = 2.33 * (Xmin + A)/(Xmin + B) * pow(Xmin, alpha);
  if (B > 0 && 1.1 * V > A/B)  /* Now Xmin is the minumum.  Compare with 0 */
      Xmin = 0;

  asize = (ulong)((1 + Xmin)*cache_arena - fixed_to_cache);
  if (asize > total) asize = total;     /* May happen due to approximations */
  return asize;
}

/* Use as in
    install(set_optimize,lLDG)          \\ Through some M too?
    set_optimize(2,1) \\ disable dependence on limit
    \\ 1: how much cache usable, 2: slowdown of setup, 3: alpha, 4: cutoff
    \\ 2,3,4 are in units of 0.001

    { time_primes_arena(ar,limit) =     \\ ar = arena size in K
        set_optimize(1,floor(ar*1024));
        default(primelimit, 200 000);   \\ 100000 results in *larger* malloc()!
        gettime;
        default(primelimit, floor(limit));
        if(ar >= 1, ar=floor(ar));
        print("arena "ar"K => "gettime"ms");
    }
*/
long
set_optimize(long what, GEN g)
{
  long ret = 0;

  switch (what) {
  case 1:
    ret = (long)cache_model.arena;
    break;
  case 2:
    ret = (long)(slow2_in_roots * 1000);
    break;
  case 3:
    ret = (long)(cache_model.power * 1000);
    break;
  case 4:
    ret = (long)(cache_model.cutoff * 1000);
    break;
  default:
    pari_err(talker, "panic: set_optimize");
    break;
  }
  if (g != NULL) {
    ulong val = itou(g);

    switch (what) {
    case 1: cache_model.arena = val; break;
    case 2: slow2_in_roots     = (double)val / 1000.; break;
    case 3: cache_model.power  = (double)val / 1000.; break;
    case 4: cache_model.cutoff = (double)val / 1000.; break;
    }
  }
  return ret;
}

static void
sieve_chunk(byteptr known_primes, ulong s, byteptr data, ulong count)
{   /* start must be odd;
       prime differences (starting from (5-3)=2) start at known_primes[2],
       are terminated by a 0 byte;

       Checks cnt odd numbers starting at `start', setting bytes
       starting at data to 0 or 1 depending on whether the
       corresponding odd number is prime or not */
    ulong p;
    byteptr q;
    register byteptr write_to = data;   /* Better code with gcc 2.8.1 */
    register ulong   cnt      = count;  /* Better code with gcc 2.8.1 */
    register ulong   start    = s;      /* Better code with gcc 2.8.1 */
    register ulong   delta    = 1;      /* Better code with gcc 2.8.1 */

    memset(data, 0, cnt);
    start >>= 1;                        /* (start - 1)/2 */
    start += cnt;                       /* Corresponds to the end */
    cnt -= 1;
    /* data corresponds to start.  q runs over primediffs.  */
    /* Don't care about DIFFPTR_SKIP: false positives provide no problem */
    for (q = known_primes + 1, p = 3; delta; delta = *++q, p += delta) {
        /* first odd number which is >= start > p and divisible by p
           = last odd number which is <= start + 2p - 1 and 0 (mod p)
           = p + the last even number which is <= start + p - 1 and 0 (mod p)
           = p + the last even number which is <= start + p - 2 and 0 (mod p)
           = p + start + p - 2 - (start + p - 2) % 2p
           = start + 2(p - 1 - ((start-1)/2 + (p-1)/2) % p). */
      long off = cnt - ((start+(p>>1)) % p);

      while (off >= 0) {
          write_to[off] = 1;
          off -= p;
      }
    }
}

/* Do not inline sieve_chunk()!  It fits into registers in ix86 non-inlined */
void (*sieve_chunk_p)(byteptr known_primes, ulong s, byteptr data, ulong count)
    = sieve_chunk;

/* Here's the workhorse.  This is recursive, although normally the first
   recursive call will bottom out and invoke initprimes1() at once.
   (Not static;  might conceivably be useful to someone in library mode) */
byteptr
initprimes0(ulong maxnum, long *lenp, ulong *lastp)
{
  long size, alloced, psize;
  byteptr q, fin, p, p1, fin1, plast, curdiff;
  ulong last, remains, curlow, rootnum, asize;
  ulong prime_above = 3;
  byteptr p_prime_above;

  if (maxnum <= 1ul<<17)        /* Arbitrary. */
    return initprimes1(maxnum>>1, lenp, (long*)lastp); /* Break recursion */

  maxnum |= 1;                  /* make it odd. */

  /* Checked to be enough up to 40e6, attained at 155893 */
  /* Due to multibyte representation of large gaps, this estimate will
     be broken by large enough maxnum.  However, assuming exponential
     distribution of gaps with the average log(n), we are safe up to
     circa exp(-256/log(1/0.09)) = 1.5e46.  OK with LONG_BITS <= 128. ;-) */
  size = (long) (1.09 * maxnum/log((double)maxnum)) + 146;
  p1 = (byteptr) gpmalloc(size);
  rootnum = (ulong) sqrt((double)maxnum); /* cast it back to a long */
  rootnum |= 1;
  {
    byteptr p2 = initprimes0(rootnum, &psize, &last); /* recursive call */
    memcpy(p1, p2, psize); free(p2);
  }
  fin1 = p1 + psize - 1;
  remains = (maxnum - rootnum) >> 1; /* number of odd numbers to check */

  /* Actually, we access primes array of psize too; but we access it
     consequently, thus we do not include it in fixed_to_cache */
  asize = good_arena_size((ulong)(rootnum * slow2_in_roots), remains + 1, 0,
                          &cache_model, 0) - 1;
  /* enough room on the stack ? */
  alloced = (((byteptr)avma) <= ((byteptr)bot) + asize);
  if (alloced)
    p = (byteptr) gpmalloc(asize + 1);
  else
    p = (byteptr) bot;
  fin = p + asize;              /* the 0 sentinel goes at fin. */
  curlow = rootnum + 2; /* First candidate: know primes up to rootnum (odd). */
  curdiff = fin1;

  /* During each iteration p..fin-1 represents a range of odd
     numbers.  plast is a pointer which represents the last prime seen,
     it may point before p..fin-1. */
  plast = p - ((rootnum - last) >> 1) - 1;
  p_prime_above = p1 + 2;
  while (remains)       /* Cycle over arenas.  Performance is not crucial */
  {
    unsigned char was_delta;

    if (asize > remains) {
      asize = remains;
      fin = p + asize;
    }
    /* Fake the upper limit appropriate for the given arena */
    while (prime_above*prime_above <= curlow + (asize << 1) && *p_prime_above)
        prime_above += *p_prime_above++;
    was_delta = *p_prime_above;
    *p_prime_above = 0;                 /* Put a 0 sentinel for sieve_chunk */

    (*sieve_chunk_p)(p1, curlow, p, asize);

    *p_prime_above = was_delta;         /* Restore */
    p[asize] = 0;                       /* Put a 0 sentinel for ZZZ */
    /* now q runs over addresses corresponding to primes */
    for (q = p; ; plast = q++)
    {
      long d;

      while (*q) q++;                   /* use ZZZ 0-sentinel at end */
      if (q >= fin) break;
      d = (q - plast) << 1;
      while (d >= DIFFPTR_SKIP)
        *curdiff++ = DIFFPTR_SKIP, d -= DIFFPTR_SKIP;
      *curdiff++ = (unsigned char)d;
    }
    plast -= asize;
    remains -= asize;
    curlow += (asize<<1);
  } /* while (remains) */
  last = curlow - ((p - plast) << 1);
  *curdiff++ = 0;               /* sentinel */
  *lenp = curdiff - p1;
  *lastp = last;
  if (alloced) free(p);
  return (byteptr) gprealloc(p1, *lenp);
}
#if 0 /* not yet... GN */
/* The diffptr table will contain at least 6548 entries (up to and including
   the 6547th prime, 65557, and the terminal null byte), because the isprime/
   small-factor-extraction machinery wants to depend on everything up to 65539
   being in the table, and we might as well go to a multiple of 4 Bytes.--GN */

void
init_tinyprimes_tridiv(byteptr p);      /* in ifactor2.c */
#endif

static ulong _maxprime = 0;

ulong
maxprime(void) { return _maxprime; }

void
maxprime_check(ulong c)
{
  if (_maxprime < c) pari_err(primer2, c);
}

byteptr
initprimes(ulong maxnum)
{
  long len;
  ulong last;
  byteptr p;
  /* The algorithm must see the next prime beyond maxnum, whence the +512. */
  ulong maxnum1 = ((maxnum<65302?65302:maxnum)+512ul);

  if ((maxnum>>1) > VERYBIGINT - 1024)
      pari_err(talker, "Too large primelimit");
  p = initprimes0(maxnum1, &len, &last);
#if 0 /* not yet... GN */
  static int build_the_tables = 1;
  if (build_the_tables) { init_tinyprimes_tridiv(p); build_the_tables=0; }
#endif
  _maxprime = last; return p;
}

static void
cleanprimetab(void)
{
  long i,j, lp = lg(primetab);

  for (i=j=1; i < lp; i++)
    if (primetab[i]) primetab[j++] = primetab[i];
  setlg(primetab,j);
}

/* p is a single element or a row vector with "primes" to add to primetab.
 * If p shares a non-trivial factor with another element in primetab, take it
 * into account. */
GEN
addprimes(GEN p)
{
  pari_sp av;
  long i,k,tx,lp;
  GEN L;

  if (!p) return primetab;
  tx = typ(p);
  if (is_vec_t(tx))
  {
    for (i=1; i < lg(p); i++) (void)addprimes(gel(p,i));
    return primetab;
  }
  if (tx != t_INT) pari_err(typeer,"addprime");
  if (is_pm1(p)) return primetab;
  av = avma; i = signe(p);
  if (i == 0) pari_err(talker,"can't accept 0 in addprimes");
  if (i < 0) p = absi(p);

  lp = lg(primetab);
  L = cgetg(2*lp,t_VEC); k = 1;
  for (i=1; i < lp; i++)
  {
    GEN n = gel(primetab,i), d = gcdii(n, p);
    if (! is_pm1(d))
    {
      if (!equalii(p,d)) gel(L,k++) = d;
      gel(L,k++) = diviiexact(n,d);
      gunclone(n); primetab[i] = 0;
    }
  }
  primetab = (GEN) gprealloc(primetab, (lp+1)*sizeof(long));
  gel(primetab,i) = gclone(p); setlg(primetab, lp+1);
  if (k > 1) { cleanprimetab(); setlg(L,k); (void)addprimes(L); }
  avma = av; return primetab;
}

static GEN
removeprime(GEN prime)
{
  long i;

  if (typ(prime) != t_INT) pari_err(typeer,"removeprime");
  for (i=lg(primetab) - 1; i; i--)
    if (absi_equal(gel(primetab,i), prime))
    {
      gunclone(gel(primetab,i)); primetab[i]=0;
      cleanprimetab(); break;
    }
  if (!i) pari_err(talker,"prime %Z is not in primetable", prime);
  return primetab;
}

GEN
removeprimes(GEN prime)
{
  long i,tx;

  if (!prime) return primetab;
  tx = typ(prime);
  if (is_vec_t(tx))
  {
    if (prime == primetab)
    {
      for (i=1; i < lg(prime); i++) gunclone(gel(prime,i));
      setlg(prime, 1);
    }
    else
    {
      for (i=1; i < lg(prime); i++) (void)removeprime(gel(prime,i));
    }
    return primetab;
  }
  return removeprime(prime);
}

/***********************************************************************/
/**                                                                   **/
/**       COMPUTING THE MATRIX OF PRIME DIVISORS AND EXPONENTS        **/
/**                                                                   **/
/***********************************************************************/
static const long decomp_default_hint = 0;

/* where to stop trial dividing in factorization */
static ulong
default_bound(GEN n, ulong all)
{
  ulong l;
  if (all > 1) return all; /* use supplied limit */
  if (!all) return (ulong)VERYBIGINT; /* smallfact() case */
  l = (ulong)expi(n) + 1;
  if (l <= 32)  return 1UL<<14;
  if (l <= 512) return (l-16) << 10;
  return 1UL<<19; /* Rho is generally faster above this */
}
static ulong
tridiv_bound(GEN n, ulong all)
{
  ulong p = maxprime(), l = default_bound(n, all);
  return min(p, l);
}

static GEN
aux_end(GEN n, long nb)
{
  GEN p1,p2, z = (GEN)avma;
  long i;

  if (n) gunclone(n);
  p1 = cgetg(nb+1,t_COL);
  p2 = cgetg(nb+1,t_COL);
  for (i=nb; i; i--)
  {
    gel(p2,i) = z; z += lg(z);
    gel(p1,i) = z; z += lg(z);
  }
  gel(z,1) = p1;
  gel(z,2) = p2;
  if (nb) (void)sort_factor_gen(z, absi_cmp);
  return z;
}

static GEN
auxdecomp1(GEN n, long (*ifac_break)(GEN n, GEN pairs, GEN here, GEN state),
                  GEN state, ulong all, long hint)
{
  pari_sp av;
  long pp[] = { evaltyp(t_INT)|_evallg(4), 0,0,0 };
  long nb = 0, i, lp;
  ulong p, k, lim;
  byteptr d = diffptr+1; /* start at p = 3 */
#define STORE(x,e) { nb++; (void)x; (void)utoipos(e); }
#define STOREu(x,e) STORE(utoipos(x), e)
#define STOREi(x,e) STORE(icopy(x),   e)

  if (typ(n) != t_INT) pari_err(arither1);
  i = signe(n); if (!i) pari_err(talker, "zero argument in factorint");
  (void)cgetg(3,t_MAT);
  if (i < 0) STORE(utoineg(1), 1);
  if (is_pm1(n)) return aux_end(NULL,nb);

  n = gclone(n); setsigne(n,1);
  i = vali(n);
  if (i)
  {
    STOREu(2, i);
    av = avma; affii(shifti(n,-i), n); avma = av;
  }
  if (is_pm1(n)) return aux_end(n,nb);
  lim = tridiv_bound(n, all);

  /* trial division */
  p = 2;
  for(;;)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    if (p >= lim) break;

    k = Z_lvalrem_stop(n, p, &stop);
    if (k) STOREu(p, k);
    if (stop)
    {
      if (!is_pm1(n)) STOREi(n, 1);
      return aux_end(n,nb);
    }
  }

  /* pp = square of biggest p tried so far */
  av = avma; affii(muluu(p,p), pp); avma = av;

  /* trial divide by the "special primes" (usually huge composites) */
  lp = lg(primetab);
  for (i=1; i<lp; i++)
    if (dvdiiz(n,gel(primetab,i), n))
    {
      long k = 1; while (dvdiiz(n,gel(primetab,i), n)) k++;
      STOREi(gel(primetab,i), k);
      if (absi_cmp(pp, n) > 0)
      {
        if (!is_pm1(n)) STOREi(n, 1);
        return aux_end(n,nb);
      }
    }

  if (all != 1) hint = 15; /* smallfact: turn off all except pure powers */
  else /* else test primality */
    if (BSW_psp(n)) { STOREi(n, 1); return aux_end(n,nb); }

  /* now we have a large composite */
  if (ifac_break && (*ifac_break)(n,NULL,NULL,state)) /*initialize ifac_break*/
  {
    if (DEBUGLEVEL>2)
      fprintferr("IFAC: (Partial fact.) Initial stop requested.\n");
  }
  else
    nb += ifac_decomp_break(n, ifac_break, state, hint);

  return aux_end(n, nb);
}

/* state[1]: current unfactored part.
 * state[2]: limit. */
static long
ifac_break_limit(GEN n, GEN pairs/*unused*/, GEN here, GEN state)
{
  pari_sp ltop = avma;
  GEN N;
  int res;
  (void)pairs;
  if (!here) /* initial call */
   /*Small primes have been removed, n is the new unfactored part.*/
    N = n;
  else
  {
    GEN q = powgi(gel(here,0),gel(here,1)); /* primary factor found.*/
    if (DEBUGLEVEL>2) fprintferr("IFAC: Stop: Primary factor: %Z\n",q);
    N = diviiexact(gel(state,1),q); /* divide unfactored part by q */
  }
  affii(N, gel(state,1)); /* affect()ed to state[1] to preserve stack. */
  if (DEBUGLEVEL>2) fprintferr("IFAC: Stop: remaining %Z\n",state[1]);
  /* check the stopping criterion, then restore stack */
  res = cmpii(gel(state,1),gel(state,2)) <= 0;
  avma = ltop; return res;
}

static GEN
auxdecomp0(GEN n, ulong all, long hint)
{
  return auxdecomp1(n, NULL, gen_0, all, hint);
}

GEN
auxdecomp(GEN n, long all)
{
  return auxdecomp0(n,all,decomp_default_hint);
}

/* see before ifac_crack() in ifactor1.c for current semantics of `hint'
   (factorint's `flag') */
GEN
factorint(GEN n, long flag)
{
  return auxdecomp0(n,1,flag);
}

GEN
Z_factor(GEN n)
{
  return auxdecomp0(n,1,decomp_default_hint);
}

/* Factor until the unfactored part is smaller than limit. */
GEN
Z_factor_limit(GEN n, GEN limit)
{
  GEN state = cgetg(3,t_VEC);
 /* icopy is mainly done to allocate memory for affect().
  * Currently state[1] is discarded in initial call to ifac_break_limit */
  gel(state,1) = icopy(n);
  gel(state,2) = gcopy(limit);
  return auxdecomp1(n, &ifac_break_limit, state, 1, decomp_default_hint);
}

GEN
smallfact(GEN n)
{
  return boundfact(n,0);
}

GEN
gboundfact(GEN n, long lim)
{
  return garith_proto2gs(boundfact,n,lim);
}

GEN
boundfact(GEN n, long lim)
{
  GEN p1, p2;
  pari_sp av = avma;

  if (lim <= 1) lim = 0;
  switch(typ(n))
  {
    case t_INT: return auxdecomp(n,lim);
    case t_FRAC:
      p1 = auxdecomp(gel(n,1),lim);
      p2 = auxdecomp(gel(n,2),lim); gel(p2,2) = gneg_i(gel(p2,2));
      return gerepilecopy(av, merge_factor_i(p1,p2));
  }
  pari_err(arither1);
  return NULL; /* not reached */
}
/***********************************************************************/
/**                                                                   **/
/**                    SIMPLE FACTORISATIONS                          **/
/**                                                                   **/
/***********************************************************************/

/* Factor n and output [p,e] where
 * p, e are vecsmall with n = prod{p[i]^e[i]} */
GEN
factoru(ulong n)
{
  pari_sp ltop = avma;
  GEN F = Z_factor(utoi(n)), P = gel(F,1), E = gel(F,2);
  long i, l = lg(P);
  GEN f = cgetg(3,t_VEC);
  GEN p = cgetg(l,t_VECSMALL);
  GEN e = cgetg(l,t_VECSMALL);
  pari_sp lbot = avma;
  gel(f,1) = p;
  gel(f,2) = e;
  for (i = 1; i < l; i++)
  {
    p[i] = itou(gel(P,i));
    e[i] = itou(gel(E,i));
  }
  avma = lbot; return gerepileupto(ltop,f);
}
/* Factor n and output [p,e,c] where
 * p, e and c are vecsmall with n = prod{p[i]^e[i]} and c[i] = p[i]^e[i] */
GEN
factoru_pow(ulong n)
{
  pari_sp ltop = avma;
  GEN F = Z_factor(utoi(n)), P = gel(F,1), E = gel(F,2);
  long i, l = lg(P);
  GEN f = cgetg(4,t_VEC);
  GEN p = cgetg(l,t_VECSMALL);
  GEN e = cgetg(l,t_VECSMALL);
  GEN c = cgetg(l,t_VECSMALL);
  pari_sp lbot = avma;
  gel(f,1) = p;
  gel(f,2) = e;
  gel(f,3) = c;
  for(i = 1; i < l; i++)
  {
    p[i] = itou(gel(P,i));
    e[i] = itou(gel(E,i));
    c[i] = itou(powiu(gel(P,i), e[i]));
  }
  avma = lbot; return gerepileupto(ltop,f);
}

/***********************************************************************/
/**                                                                   **/
/**                    BASIC ARITHMETIC FUNCTIONS                     **/
/**                                                                   **/
/***********************************************************************/

GEN
gmu(GEN n) { return arith_proto(mu,n,1); }

INLINE void
chk_arith(GEN n) {
  if (typ(n) != t_INT) pari_err(arither1);
  if (!signe(n)) pari_err(talker, "zero argument in an arithmetic function");
}

long
mu(GEN n)
{
  byteptr d = diffptr+1; /* point at 3 - 2 */
  pari_sp av = avma;
  ulong p, lim;
  long s, v;

  chk_arith(n); if (is_pm1(n)) return 1;
  if (equaliu(n, 2)) return -1;
  p = mod4(n); if (!p) return 0;
  if (p == 2) { s = -1; n = shifti(n, -1); } else { s = 1; n = icopy(n); }
  setsigne(n, 1);

  lim = tridiv_bound(n,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(n, p, &stop);
    if (v > 1) { avma = av; return 0; }
    if (v) s = -s;
    if (stop) { avma = av; return is_pm1(n)? s: -s; }
  }
  if (BSW_psp(n)) { avma=av; return -s; }
  /* large composite without small factors */
  v = ifac_moebius(n, decomp_default_hint);
  avma = av; return (s<0 ? -v : v); /* correct also if v==0 */
}

GEN
gissquarefree(GEN x) { return arith_proto(issquarefree,x,0); }

long 
Z_issquarefree(GEN x)
{
  byteptr d = diffptr+1;
  pari_sp av = avma;
  ulong p, lim;
  long v;

  if (!signe(x)) return 0;
  if (cmpiu(x, 2) <= 0) return 1;
  p = mod4(x); if (!p) return 0;
  x = (p == 2)? shifti(x, -1): icopy(x); 
  setsigne(x, 1);

  lim = tridiv_bound(x,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(x, p, &stop);
    if (v > 1) { avma = av; return 0; }
    if (stop) { avma = av; return 1; }
  }
  if (BSW_psp(x)) { avma = av; return 1; }
  v = ifac_issquarefree(x, decomp_default_hint);
  avma = av; return v;
}

long
issquarefree(GEN x)
{
  pari_sp av;
  GEN d;
  switch(typ(x))
  {
    case t_INT: return Z_issquarefree(x);
    case t_POL:
      if (!signe(x)) return 0;
      av = avma; d = ggcd(x, derivpol(x));
      avma = av; return (lg(d) == 3);
    default: pari_err(typeer,"issquarefree");
      return 0; /* not reached */
  }
}

GEN
gomega(GEN n) { return arith_proto(omega,n,1); }

long
omega(GEN n)
{
  byteptr d = diffptr+1;
  pari_sp av = avma;
  long nb,v;
  ulong p, lim;

  chk_arith(n); if (is_pm1(n)) return 0;
  v = vali(n); nb = v ? 1 : 0;
  n = shifti(n, -v);
  if (is_pm1(n)) return nb;
  setsigne(n, 1);

  lim = tridiv_bound(n,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(n, p, &stop);
    if (v) nb++;
    if (stop) { avma = av; return is_pm1(n)? nb: nb+1; }
  }
  if (BSW_psp(n)) { avma = av; return nb+1; }
  /* large composite without small factors */
  nb += ifac_omega(n, decomp_default_hint);
  avma = av; return nb;
}

GEN
gbigomega(GEN n) { return arith_proto(bigomega,n,1); }

long
bigomega(GEN n)
{
  byteptr d=diffptr+1;
  pari_sp av = avma;
  ulong p, lim;
  long nb,v;

  chk_arith(n); if (is_pm1(n)) return 0;
  nb = v = vali(n); n = shifti(n, -v);
  if (is_pm1(n)) { avma = av; return nb; }
  setsigne(n, 1);

  lim = tridiv_bound(n,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(n, p, &stop);
    nb += v;
    if (stop) { avma = av; return is_pm1(n)? nb: nb+1; }
  }
  if (BSW_psp(n)) { avma = av; return nb+1; }
  nb += ifac_bigomega(n, decomp_default_hint);
  avma = av; return nb;
}

GEN
gphi(GEN n) { return garith_proto(phi,n,1); }

GEN
phi(GEN n)
{
  byteptr d = diffptr+1;
  pari_sp av = avma;
  GEN m;
  ulong p, lim;
  long v;

  chk_arith(n); if (is_pm1(n)) return gen_1;
  v = vali(n); n = shifti(n,-v); setsigne(n, 1);
  m = v > 1 ? int2n(v-1) : gen_1;
  if (is_pm1(n)) return gerepileuptoint(av,m);

  lim = tridiv_bound(n,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(n, p, &stop);
    if (v) {
      m = mulis(m, p-1);
      if (v > 2) m = mulii(m, powuu(p, v-1));
      else if (v == 2) m = mulis(m, p);
    }
    if (stop) { 
      if (!is_pm1(n)) m = mulii(m, addis(n,-1));
      return gerepileuptoint(av,m);
    }
  }
  if (BSW_psp(n)) return gerepileuptoint(av, mulii(m, addis(n,-1)));
  m = mulii(m, ifac_totient(n, decomp_default_hint));
  return gerepileuptoint(av,m);
}

GEN
gnumbdiv(GEN n) { return garith_proto(numbdiv,n,1); }

GEN
numbdiv(GEN n)
{
  byteptr d = diffptr+1;
  pari_sp av = avma;
  GEN m;
  long v;
  ulong p, lim;

  chk_arith(n); if (is_pm1(n)) return gen_1;
  v = vali(n); n = shifti(n,-v); setsigne(n,1);
  m = utoipos(v+1);
  if (is_pm1(n)) return gerepileuptoint(av,m);

  lim = tridiv_bound(n,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(n, p, &stop);
    if (v) m = mulis(m, v+1);
    if (stop)
    {
      if (!is_pm1(n)) m = shifti(m,1);
      return gerepileuptoint(av,m);
    }
  }
  if(BSW_psp(n)) return gerepileuptoint(av, shifti(m,1));
  m = mulii(m, ifac_numdiv(n, decomp_default_hint));
  return gerepileuptoint(av,m);
}

GEN
gsumdiv(GEN n) { return garith_proto(sumdiv,n,1); }

GEN
sumdiv(GEN n)
{
  byteptr d = diffptr+1;
  pari_sp av = avma;
  GEN m;
  ulong p, lim;
  long v;

  chk_arith(n); if (is_pm1(n)) return gen_1;
  v = vali(n); n = shifti(n,-v); setsigne(n,1);
  m = v ? addsi(-1, int2n(v+1)) : gen_1;
  if (is_pm1(n)) return gerepileuptoint(av,m);

  lim = tridiv_bound(n,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(n, p, &stop);
    if (v)
    {
      GEN m1 = utoipos(p+1);
      long i;
      for (i = 1; i < v; i++) m1 = addsi(1, mului(p,m1));
      m = mulii(m1,m);
    }
    if (stop)
    {
      if (!is_pm1(n)) m = mulii(m,addis(n,1));
      return gerepileuptoint(av, m);
    }
  }
  if(BSW_psp(n)) return gerepileuptoint(av, mulii(m,addsi(1,n)));
  m = mulii(m, ifac_sumdiv(n, decomp_default_hint));
  return gerepileuptoint(av,m);
}

GEN
gsumdivk(GEN n, long k) { return garith_proto2gs(sumdivk,n,k); }

GEN
sumdivk(GEN n, long k)
{
  byteptr d = diffptr+1;
  pari_sp av = avma;
  GEN n1, m;
  ulong p, lim;
  long k1,v;

  if (!k) return numbdiv(n);
  if (k == 1) return sumdiv(n);
  chk_arith(n); if (is_pm1(n)) return gen_1;
  k1 = k; n1 = n;
  if (k < 0)  k = -k;
  if (k == 1) { m = sumdiv(n); goto fin; }
  v = vali(n); n = shifti(n,-v); setsigne(n,1);
  m = gen_1;
  while (v--)  m = addsi(1,shifti(m,k));
  if (is_pm1(n)) goto fin;

  lim = tridiv_bound(n,1);
  p = 2;
  while (p < lim)
  {
    int stop;
    NEXT_PRIME_VIADIFF(p,d);
    v = Z_lvalrem_stop(n, p, &stop);
    if (v)
    {
      long i;
      GEN pk = powuu(p,k), m1 = addsi(1,pk);
      for (i = 1; i < v; i++) m1 = addsi(1, mulii(pk,m1));
      m = mulii(m1,m);
    }
    if (stop)
    {
      if (!is_pm1(n)) m = mulii(m, addsi(1, powiu(n,k)));
      goto fin;
    }
  }
  if (BSW_psp(n)) { m = mulii(m, addsi(1, powiu(n,k))); goto fin; }
  m = mulii(m, ifac_sumdivk(n, k, decomp_default_hint));
 fin:
  if (k1 < 0) m = gdiv(m, powiu(n1,k));
  return gerepileupto(av,m);
}

/***********************************************************************/
/**                                                                   **/
/**                MISCELLANEOUS ARITHMETIC FUNCTIONS                 **/
/**         (all of these ultimately depend on auxdecomp())           **/
/**                                                                   **/
/***********************************************************************/

GEN
divisors(GEN n)
{
  pari_sp av = avma;
  long i, j, l, tn = typ(n);
  ulong nbdiv;
  int isint = 1;
  GEN *d, *t, *t1, *t2, *t3, P, E, e;

  if (tn == t_MAT && lg(n) == 3)
  {
    P = gel(n,1); l = lg(P);
    for (i = 1; i < l; i++)
      if (typ(gel(P,i)) != t_INT) { isint = 0; break; }
  }
  else
  {
    if (tn == t_INT)
      n = auxdecomp(n,1);
    else {
      if (is_matvec_t(tn)) pari_err(typeer,"divisors");
      isint = 0;
      n = factor(n);
    }
    P = gel(n,1); l = lg(P);
  }
  E = gel(n,2);
  if (isint && l>1 && signe(P[1]) < 0) { E++; P++; l--; } /* skip -1 */
  e = cgetg(l, t_VECSMALL);
  nbdiv = 1;
  for (i=1; i<l; i++)
  {
    e[i] = itos(gel(E,i));
    if (e[i] < 0) pari_err(talker, "denominators not allowed in divisors");
    nbdiv = itou_or_0( muluu(nbdiv, 1+e[i]) );
  }
  if (!nbdiv || nbdiv & ~LGBITS)
    pari_err(talker, "too many divisors (more than %ld)", LGBITS - 1);
  d = t = (GEN*) cgetg(nbdiv+1,t_VEC);
  *++d = gen_1;
  if (isint)
  {
    for (i=1; i<l; i++)
      for (t1=t,j=e[i]; j; j--,t1=t2)
        for (t2=d, t3=t1; t3<t2; ) *++d = mulii(*++t3, gel(P,i));
    e = sort((GEN)t);
  } else {
    for (i=1; i<l; i++)
      for (t1=t,j=e[i]; j; j--,t1=t2)
        for (t2=d, t3=t1; t3<t2; ) *++d = gmul(*++t3, gel(P,i));
    e = (GEN)t;
  }
  return gerepileupto(av, e);
}

GEN
corepartial(GEN n, long all)
{
  pari_sp av = avma;
  long i;
  GEN fa, P, E, c = gen_1;

  fa = auxdecomp(n,all);
  P = gel(fa,1);
  E = gel(fa,2);
  for (i=1; i<lg(P); i++)
    if (mod2(gel(E,i))) c = mulii(c,gel(P,i));
  return gerepileuptoint(av, c);
}

GEN
core2partial(GEN n, long all)
{
  pari_sp av = avma;
  long i;
  GEN fa, P, E, c = gen_1, f = gen_1;

  fa = auxdecomp(n,all);
  P = gel(fa,1);
  E = gel(fa,2);
  for (i=1; i<lg(P); i++)
  {
    long e = itos(gel(E,i));
    if (e & 1)  c = mulii(c, gel(P,i));
    if (e != 1) f = mulii(f, gpowgs(gel(P,i), e >> 1));
  }
  return gerepilecopy(av, mkvec2(c,f));
}

GEN core(GEN n)  { return corepartial(n,1); }
GEN core2(GEN n) { return core2partial(n,1); }

GEN
core0(GEN n,long flag) { return flag? core2(n): core(n); }

static long
_mod4(GEN c) { long r = mod4(c); if (signe(c) < 0) r = 4-r; return r; }

GEN
coredisc(GEN n)
{
  pari_sp av = avma;
  GEN c = core(n);
  if (_mod4(c)==1) return c;
  return gerepileuptoint(av, shifti(c,2));
}

GEN
coredisc2(GEN n)
{
  pari_sp av = avma;
  GEN y = core2(n);
  GEN c = gel(y,1), f = gel(y,2);
  if (_mod4(c)==1) return y;
  y = cgetg(3,t_VEC);
  gel(y,1) = shifti(c,2);
  gel(y,2) = gmul2n(f,-1); return gerepileupto(av, y);
}

GEN
coredisc0(GEN n,long flag) { return flag? coredisc2(n): coredisc(n); }

/*********************************************************************/
/**                                                                 **/
/**                       BINARY DECOMPOSITION                      **/
/**                                                                 **/
/*********************************************************************/

INLINE GEN
inegate(GEN z) { return subsi(-1,z); }

GEN
binaire(GEN x)
{
  ulong m,u;
  long i,lx,ex,ly,tx=typ(x);
  GEN y,p1,p2;

  switch(tx)
  {
    case t_INT:
    {
      GEN xp=int_MSW(x);
      lx=lgefint(x);
      if (lx==2) return mkvec(gen_0);
      ly = BITS_IN_LONG+1; m=HIGHBIT; u=*xp;
      while (!(m & u)) { m>>=1; ly--; }
      y = cgetg(ly+((lx-3)<<TWOPOTBITS_IN_LONG),t_VEC); ly=1;
      do { gel(y,ly) = m & u ? gen_1 : gen_0; ly++; } while (m>>=1);
      for (i=3; i<lx; i++)
      {
        m=HIGHBIT; xp=int_precW(xp); u=*xp;
        do { gel(y,ly) = m & u ? gen_1 : gen_0; ly++; } while (m>>=1);
      }
      break;
    }
    case t_REAL:
      ex=expo(x);
      if (!signe(x))
      {
        lx=1+max(-ex,0); y=cgetg(lx,t_VEC);
        for (i=1; i<lx; i++) gel(y,i) = gen_0;
        return y;
      }

      lx=lg(x); y=cgetg(3,t_VEC);
      if (ex > bit_accuracy(lx)) pari_err(precer,"binary");
      p1 = cgetg(max(ex,0)+2,t_VEC);
      p2 = cgetg(bit_accuracy(lx)-ex,t_VEC);
      gel(y,1) = p1;
      gel(y,2) = p2;
      ly = -ex; ex++; m = HIGHBIT;
      if (ex<=0)
      {
        gel(p1,1) = gen_0; for (i=1; i <= -ex; i++) gel(p2,i) = gen_0;
        i=2;
      }
      else
      {
        ly=1;
        for (i=2; i<lx && ly<=ex; i++)
        {
          m=HIGHBIT; u=x[i];
          do
            { gel(p1,ly) = (m & u) ? gen_1 : gen_0; ly++; }
          while ((m>>=1) && ly<=ex);
        }
        ly=1;
        if (m) i--; else m=HIGHBIT;
      }
      for (; i<lx; i++)
      {
        u=x[i];
        do { gel(p2,ly) = m & u ? gen_1 : gen_0; ly++; } while (m>>=1);
        m=HIGHBIT;
      }
      break;

    case t_VEC: case t_COL: case t_MAT:
      lx=lg(x); y=cgetg(lx,tx);
      for (i=1; i<lx; i++) gel(y,i) = binaire(gel(x,i));
      break;
    default: pari_err(typeer,"binary");
      return NULL; /* not reached */
  }
  return y;
}

/* return 1 if bit n of x is set, 0 otherwise */
long
bittest(GEN x, long n)
{
  ulong u;
  long l;

  if (!signe(x) || n < 0) return 0;
  if (signe(x) < 0)
  {
    pari_sp ltop=avma;
    long b = !bittest(inegate(x),n);
    avma=ltop;
    return b;
  }
  l = n>>TWOPOTBITS_IN_LONG;
  if (l+3 > lgefint(x)) return 0;
  u = (1UL << (n & (BITS_IN_LONG-1))) & *int_W(x,l);
  return u? 1: 0;
}

GEN
gbittest(GEN x, GEN n)
{
  return arith_proto2gs(bittest,x,itos(n));
}

/***********************************************************************/
/**                                                                   **/
/**                          BITMAP OPS                               **/
/** x & y (and), x | y (or), x ^ y (xor), ~x (neg), x & ~y (negimply) **/
/**                                                                   **/
/***********************************************************************/
/* Truncate a non-negative integer to a number of bits.  */
static GEN
ibittrunc(GEN x, long bits)
{
  long known_zero_words, xl = lgefint(x) - 2;
  long len_out = ((bits + BITS_IN_LONG - 1) >> TWOPOTBITS_IN_LONG);

  if (xl < len_out)
      return x;
      /* Check whether mask is trivial */
  if (!(bits & (BITS_IN_LONG - 1))) {
      if (xl == len_out)
          return x;
  } else if (len_out <= xl) {
    GEN xi = int_W(x, len_out-1);
    /* Non-trival mask is given by a formula, if x is not
       normalized, this works even in the exceptional case */
    *xi = *xi & ((1 << (bits & (BITS_IN_LONG - 1))) - 1);
    if (*xi && xl == len_out) return x;
  }
  /* Normalize */
  known_zero_words = xl - len_out;
  if (known_zero_words < 0) known_zero_words = 0;
  return int_normalize(x, known_zero_words);
}

GEN
gbitneg(GEN x, long bits)
{
  const ulong uzero = 0;
  long xl, len_out, i;

  if (typ(x) != t_INT)
      pari_err(typeer, "bitwise negation");
  if (bits < -1)
      pari_err(talker, "negative exponent in bitwise negation");
  if (bits == -1) return inegate(x);
  if (bits == 0) return gen_0;
  if (signe(x) < 0) { /* Consider as if mod big power of 2 */
    pari_sp ltop = avma;
    return gerepileuptoint(ltop, ibittrunc(inegate(x), bits));
  }
  xl = lgefint(x);
  len_out = ((bits + BITS_IN_LONG - 1) >> TWOPOTBITS_IN_LONG) + 2;
  if (len_out > xl) { /* Need to grow */
    GEN out, outp, xp = int_MSW(x);
    out = cgeti(len_out); out[1] = evalsigne(1) | evallgefint(len_out);
    outp = int_MSW(out);
    if (!(bits & (BITS_IN_LONG - 1)))
      *outp = ~uzero;
    else
      *outp = (1 << (bits & (BITS_IN_LONG - 1))) - 1;
    for (i = 3; i < len_out - xl + 2; i++)
    {
      outp = int_precW(outp); *outp = ~uzero;
    }
    for (     ; i < len_out; i++)
    {
      outp = int_precW(outp); *outp = ~*xp;
      xp   = int_precW(xp);
    }
    return out;
  }
  x = icopy(x);
  for (i = 2; i < xl; i++) x[i] = ~x[i];
  return ibittrunc(int_normalize(x,0), bits);
}

/* bitwise 'and' of two positive integers (any integers, but we ignore sign).
 * Inputs are not necessary normalized. */
GEN
ibitand(GEN x, GEN y)
{
  long lx, ly, lout;
  long *xp, *yp, *outp;
  GEN out;
  long i;

  if (!signe(x) || !signe(y)) return gen_0;
  lx=lgefint(x); ly=lgefint(y);
  lout = min(lx,ly); /* > 2 */
  xp = int_LSW(x);
  yp = int_LSW(y);
  out = cgeti(lout); out[1] = evalsigne(1) | evallgefint(lout);
  outp = int_LSW(out);
  for (i=2; i<lout; i++)
  {
    *outp = (*xp) & (*yp);
    outp  = int_nextW(outp);
    xp    = int_nextW(xp);
    yp    = int_nextW(yp);
  }
  if ( !*int_MSW(out) ) out = int_normalize(out, 1);
  return out;
}

/* bitwise 'or' of absolute values of two integers */
GEN
ibitor(GEN x, GEN y)
{
  long lx, ly;
  long *xp, *yp, *outp;
  GEN  out;
  long i;
  if (!signe(x)) return absi(y);
  if (!signe(y)) return absi(x);

  lx = lgefint(x); xp = int_LSW(x);
  ly = lgefint(y); yp = int_LSW(y);
  if (lx < ly) swapspec(xp,yp,lx,ly);
  /* lx > 2 */
  out = cgeti(lx); out[1] = evalsigne(1) | evallgefint(lx);
  outp = int_LSW(out);
  for (i=2;i<ly;i++)
  {
    *outp = (*xp) | (*yp);
    outp  = int_nextW(outp);
    xp    = int_nextW(xp);
    yp    = int_nextW(yp);
  }
  for (   ;i<lx;i++)
  {
    *outp = *xp;
    outp  = int_nextW(outp);
    xp    = int_nextW(xp);
  }
  /* If input is normalized, this is not needed */
  if ( !*int_MSW(out) ) out = int_normalize(out, 1);
  return out;
}

/* bitwise 'xor' of absolute values of two integers */
GEN
ibitxor(GEN x, GEN y)
{
  long lx, ly;
  long *xp, *yp, *outp;
  GEN  out;
  long i;
  if (!signe(x)) return absi(y);
  if (!signe(y)) return absi(x);

  lx = lgefint(x); xp = int_LSW(x);
  ly = lgefint(y); yp = int_LSW(y);
  if (lx < ly) swapspec(xp,yp,lx,ly);
  /* lx > 2 */
  out = cgeti(lx); out[1] = evalsigne(1) | evallgefint(lx);
  outp = int_LSW(out);
  for (i=2;i<ly;i++)
  {
    *outp = (*xp) ^ (*yp);
    outp  = int_nextW(outp);
    xp    = int_nextW(xp);
    yp    = int_nextW(yp);
  }
  for (   ;i<lx;i++)
  {
    *outp = *xp;
    outp  = int_nextW(outp);
    xp    = int_nextW(xp);
  }
  if ( !*int_MSW(out) ) out = int_normalize(out, 1);
  return out;
}

/* bitwise 'negimply' of absolute values of two integers */
/* "negimply(x,y)" is ~(x => y) == ~(~x | y) == x & ~y   */
GEN
ibitnegimply(GEN x, GEN y)
{
  long lx, ly, lout, lin;
  long *xp, *yp, *outp;
  GEN out;
  long i;
  if (!signe(x)) return gen_0;
  if (!signe(y)) return absi(x);

  lx = lgefint(x); xp = int_LSW(x);
  ly = lgefint(y); yp = int_LSW(y);
  if (lx < ly) { lin = lx; lout = ly; } else { lin = ly; lout = lx; }
  /* lout > 2 */
  out = cgeti(lout); out[1] = evalsigne(1) | evallgefint(lout);
  outp = int_LSW(out);
  for (i=2; i<lin; i++)
  {
    *outp = (*xp) & ~(*yp);
    outp  = int_nextW(outp);
    xp    = int_nextW(xp);
    yp    = int_nextW(yp);
  }
  for (   ;i<lx;i++)
  {
    *outp = *xp;
    outp  = int_nextW(outp);
    xp    = int_nextW(xp);
  }
  for (   ;i<ly;i++)
  {
    *outp = ~(*yp);
    outp  = int_nextW(outp);
    yp    = int_nextW(yp);
  }
  if ( !*int_MSW(out) ) out = int_normalize(out, 1);
  return out;
}

#define signs(x,y) (((signe(x) >= 0) << 1) | (signe(y) >= 0))

GEN
gbitor(GEN x, GEN y)
{
  pari_sp ltop = avma;
  GEN z;

  if (typ(x) != t_INT || typ(y) != t_INT) pari_err(typeer, "bitwise or");
  switch (signs(x, y))
  {
    case 3: /*1,1*/
      return ibitor(x,y);
    case 2: /*1,-1*/
      z = ibitnegimply(inegate(y),x);
      break;
    case 1: /*-1,1*/
      z = ibitnegimply(inegate(x),y);
      break;
    case 0: /*-1,-1*/
      z = ibitand(inegate(x),inegate(y));
      break;
    default: return NULL;
  }
  return gerepileuptoint(ltop, inegate(z));
}

GEN
gbitand(GEN x, GEN y)
{
  pari_sp ltop = avma;
  GEN z;

  if (typ(x) != t_INT || typ(y) != t_INT) pari_err(typeer, "bitwise and");
  switch (signs(x, y))
  {
    case 3: /*1,1*/
      return ibitand(x,y);
    case 2: /*1,-1*/
      z = ibitnegimply(x,inegate(y));
      break;
    case 1: /*-1,1*/
      z = ibitnegimply(y,inegate(x));
      break;
    case 0: /*-1,-1*/
      z = inegate(ibitor(inegate(x),inegate(y)));
      break;
    default: return NULL;
  }
  return gerepileuptoint(ltop, z);
}

GEN
gbitxor(GEN x, GEN y)
{
  pari_sp ltop = avma;
  GEN z;

  if (typ(x) != t_INT || typ(y) != t_INT) pari_err(typeer, "bitwise xor");
  switch (signs(x, y))
  {
    case 3: /*1,1*/
      return ibitxor(x,y);
    case 2: /*1,-1*/
      z = inegate(ibitxor(x,inegate(y)));
      break;
    case 1: /*-1,1*/
      z = inegate(ibitxor(inegate(x),y));
      break;
    case 0: /*-1,-1*/
      z = ibitxor(inegate(x),inegate(y));
      break;
    default: return NULL;
  }
  return gerepileuptoint(ltop,z);
}

/* x & ~y */
GEN
gbitnegimply(GEN x, GEN y)
{
  pari_sp ltop = avma;
  GEN z;

  if (typ(x) != t_INT || typ(y) != t_INT) pari_err(typeer, "bitwise negated imply");
  switch (signs(x, y))
  {
    case 3: /*1,1*/
      return ibitnegimply(x,y);
    case 2: /*1,-1*/
      z = ibitand(x,inegate(y));
      break;
    case 1: /*-1,1*/
      z = inegate(ibitor(y,inegate(x)));
      break;
    case 0: /*-1,-1*/
      z = ibitnegimply(inegate(y),inegate(x));
      break;
    default: return NULL;
  }
  return gerepileuptoint(ltop,z);
}