#include <NTL/GF2X.h>
#include <NTL/vec_long.h>

#include <NTL/new.h>

NTL_START_IMPL

/********** data structures for accesss to GF2XRegisters ************/

static GF2X GF2XRegisterVec[32];
static long GF2XRegisterTop = 0;


class GF2XRegisterType {
public:

GF2X *xrep;

GF2XRegisterType()
{ xrep = &GF2XRegisterVec[GF2XRegisterTop]; GF2XRegisterTop++; }

~GF2XRegisterType()
{ GF2XRegisterTop--; }

operator GF2X& () { return *xrep; }

};

#define GF2XRegister(a) GF2XRegisterType GF2XReg__ ## a ; GF2X& a = GF2XReg__ ## a







static vec_GF2X stab;  // used by PlainDivRem and PlainRem

static WordVector GF2X_rembuf;


void PlainDivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2X& b)
{
   long da, sa, posa, db, sb, posb, dq, sq, posq;

   da = deg(a);
   db = deg(b);

   if (db < 0) Error("GF2X: division by zero");

   if (da < db) {
      r = a;
      clear(q);
      return;
   }

   sa = a.xrep.length();
   posa = da - NTL_BITS_PER_LONG*(sa-1);
   sb = b.xrep.length();
   posb = db - NTL_BITS_PER_LONG*(sb-1);

   dq = da - db;
   sq = dq/NTL_BITS_PER_LONG + 1;
   posq = dq - NTL_BITS_PER_LONG*(sq-1);

   _ntl_ulong *ap;
   if (&r == &a)
      ap = r.xrep.elts();
   else {
      GF2X_rembuf = a.xrep;
      ap = GF2X_rembuf.elts();
   }

   stab.SetLength(NTL_BITS_PER_LONG);
   long i;

   stab[posb] = b;
   for (i = 1; i <= min(dq, NTL_BITS_PER_LONG-1); i++) 
      MulByX(stab[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG], 
             stab[((_ntl_ulong)(posb+i-1))%NTL_BITS_PER_LONG]);

   _ntl_ulong *stab_ptr[NTL_BITS_PER_LONG];
   long stab_cnt[NTL_BITS_PER_LONG];

   for (i = 0; i <= min(dq, NTL_BITS_PER_LONG-1); i++) {
      WordVector& st = stab[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG].xrep;
      long k = st.length();
      stab_ptr[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG] = &st[k-1];
      stab_cnt[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG] = -k+1;
   }

   q.xrep.SetLength(sq);
   _ntl_ulong *qp = q.xrep.elts();
   for (i = 0; i < sq; i++)
      qp[i] = 0;

   _ntl_ulong *atop = &ap[sa-1];
   _ntl_ulong *qtop = &qp[sq-1];
   _ntl_ulong *stab_top;

   while (1) {
      if (atop[0] & (1UL << posa)) {
         qtop[0] |= (1UL << posq);
         stab_top = stab_ptr[posa];
         for (i = stab_cnt[posa]; i <= 0; i++)
            atop[i] ^= stab_top[i];
      }

      da--;
      if (da < db) break;

      posa--;
      if (posa < 0) {
         posa = NTL_BITS_PER_LONG-1;
         atop--;
      }

      posq--;
      if (posq < 0) {
         posq = NTL_BITS_PER_LONG-1;
         qtop--;
      }
   }

   if (posb == 0) sb--;

   r.xrep.SetLength(sb);
   if (&r != &a) {
      _ntl_ulong *rp = r.xrep.elts();
      for (i = 0; i < sb; i++)
         rp[i] = ap[i];
   }
   r.normalize();
}



void PlainDiv(GF2X& q, const GF2X& a, const GF2X& b)
{
   GF2XRegister(r);
   PlainDivRem(q, r, a, b);
}


void PlainRem(GF2X& r, const GF2X& a, const GF2X& b)
{
   long da, sa, posa, db, sb, posb;

   da = deg(a);
   db = deg(b);

   if (db < 0) Error("GF2X: division by zero");

   if (da < db) {
      r = a;
      return;
   }

   sa = a.xrep.length();
   posa = da - NTL_BITS_PER_LONG*(sa-1);
   sb = b.xrep.length();
   posb = db - NTL_BITS_PER_LONG*(sb-1);

   _ntl_ulong *ap;
   if (&r == &a)
      ap = r.xrep.elts();
   else {
      GF2X_rembuf = a.xrep;
      ap = GF2X_rembuf.elts();
   }

   stab.SetLength(NTL_BITS_PER_LONG);
   long i;

   stab[posb] = b;
   for (i = 1; i <= min(da-db, NTL_BITS_PER_LONG-1); i++) 
      MulByX(stab[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG], 
             stab[((_ntl_ulong)(posb+i-1))%NTL_BITS_PER_LONG]);

   _ntl_ulong *stab_ptr[NTL_BITS_PER_LONG];
   long stab_cnt[NTL_BITS_PER_LONG];

   for (i = 0; i <= min(da-db, NTL_BITS_PER_LONG-1); i++) {
      WordVector& st = stab[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG].xrep;
      long k = st.length();
      stab_ptr[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG] = &st[k-1];
      stab_cnt[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG] = -k+1;
   }


   _ntl_ulong *atop = &ap[sa-1];
   _ntl_ulong *stab_top;

   while (1) {
      if (atop[0] & (1UL << posa)) {
         stab_top = stab_ptr[posa];
         for (i = stab_cnt[posa]; i <= 0; i++)
            atop[i] ^= stab_top[i];
      }

      da--;
      if (da < db) break;

      posa--;
      if (posa < 0) {
         posa = NTL_BITS_PER_LONG-1;
         atop--;
      }
   }

   if (posb == 0) sb--;

   r.xrep.SetLength(sb);
   if (&r != &a) {
      _ntl_ulong *rp = r.xrep.elts();
      for (i = 0; i < sb; i++)
         rp[i] = ap[i];
   }
   r.normalize();
}

#define MASK8 ((1UL << 8)-1UL)

static _ntl_ulong invtab[128] = {
1UL, 255UL, 85UL, 219UL, 73UL, 151UL, 157UL, 51UL, 17UL, 175UL,
69UL, 139UL, 89UL, 199UL, 141UL, 99UL, 33UL, 95UL, 117UL, 123UL,
105UL, 55UL, 189UL, 147UL, 49UL, 15UL, 101UL, 43UL, 121UL, 103UL,
173UL, 195UL, 65UL, 191UL, 21UL, 155UL, 9UL, 215UL, 221UL, 115UL,
81UL, 239UL, 5UL, 203UL, 25UL, 135UL, 205UL, 35UL, 97UL, 31UL,
53UL, 59UL, 41UL, 119UL, 253UL, 211UL, 113UL, 79UL, 37UL, 107UL,
57UL, 39UL, 237UL, 131UL, 129UL, 127UL, 213UL, 91UL, 201UL, 23UL,
29UL, 179UL, 145UL, 47UL, 197UL, 11UL, 217UL, 71UL, 13UL, 227UL,
161UL, 223UL, 245UL, 251UL, 233UL, 183UL, 61UL, 19UL, 177UL, 143UL,
229UL, 171UL, 249UL, 231UL, 45UL, 67UL, 193UL, 63UL, 149UL, 27UL,
137UL, 87UL, 93UL, 243UL, 209UL, 111UL, 133UL, 75UL, 153UL, 7UL,
77UL, 163UL, 225UL, 159UL, 181UL, 187UL, 169UL, 247UL, 125UL, 83UL,
241UL, 207UL, 165UL, 235UL, 185UL, 167UL, 109UL, 3UL };



void NewtonInvTrunc(GF2X& c, const GF2X& a, long e)
{
   if (e == 1) {
      set(c);
      return;
   }

   static vec_long E;
   E.SetLength(0);
   append(E, e);
   while (e > 8) {
      e = (e+1)/2;
      append(E, e);
   }

   long L = E.length();

   GF2XRegister(g);
   GF2XRegister(g0);
   GF2XRegister(g1);
   GF2XRegister(g2);

   g.xrep.SetMaxLength((E[0]+NTL_BITS_PER_LONG-1)/NTL_BITS_PER_LONG + 1);
   g0.xrep.SetMaxLength((E[0]+NTL_BITS_PER_LONG-1)/NTL_BITS_PER_LONG + 1);
   g1.xrep.SetMaxLength(((3*E[0]+1)/2+NTL_BITS_PER_LONG-1)/NTL_BITS_PER_LONG+1);
   g2.xrep.SetMaxLength((E[0]+NTL_BITS_PER_LONG-1)/NTL_BITS_PER_LONG + 1);

   g.xrep.SetLength(1);
   g.xrep[0] = invtab[(a.xrep[0] & MASK8) >> 1] & ((1UL<<e)-1UL);

   long i;

   for (i = L-1; i > 0; i--) {
      // lift from E[i] to E[i-1]

      long k = E[i];
      long l = E[i-1]-E[i];

      trunc(g0, a, k+l);

      mul(g1, g0, g);
      RightShift(g1, g1, k);
      trunc(g1, g1, l);

      mul(g2, g1, g);
      trunc(g2, g2, l);
      LeftShift(g2, g2, k);

      add(g, g, g2);
   }

   c = g;
}

void InvTrunc(GF2X& c, const GF2X& a, long e)
{
   if (ConstTerm(a) == 0 || e < 0)
      Error("inv: bad args");

   if (NTL_OVERFLOW(e, 1, 0))
      Error("overflow in InvTrunc");

   if (e == 0) {
      clear(c);
      return;
   }

   NewtonInvTrunc(c, a, e);
}



static 
long weight1(_ntl_ulong a)
{
   long res = 0;
   while (a) {
      if (a & 1) res ++;
      a >>= 1;
   }
   return res;
}

long weight(const GF2X& a)
{
   long wlen = a.xrep.length();
   long res = 0;
   long i;
   for (i = 0; i < wlen; i++)
      res += weight1(a.xrep[i]);

   return res;
}



static
void SparsityCheck(const GF2X& f, long& k3, long& k2, long& k1)
{
   long w = weight(f);
   if (w != 3 && w != 5) {
      k3 = 0;
      return;
   }

   if (ConstTerm(f) != 1) {
      k3 = 0;
      return;
   }

   GF2X g = f;

   long n = deg(f);

   trunc(g, g, n);
   
   long t = deg(g);

   if (n-t < NTL_BITS_PER_LONG || t > (n+1)/2) {
      k3 = 0;
      return;
   }

   if (w == 3) {
      k3 = t;
      k2 = 0;
      return;
   }

   k3 = t;
   trunc(g, g, t);
   t = deg(g);
   k2 = t;
   trunc(g, g, t);
   t = deg(g);
   k1 = t;
}




const long GF2X_MOD_PLAIN = 0;
const long GF2X_MOD_MUL = 1;
const long GF2X_MOD_SPECIAL = 2;
const long GF2X_MOD_TRI = 3;
const long GF2X_MOD_PENT = 4;

void build(GF2XModulus& F, const GF2X& f)
{
   long n = deg(f);
   long i;

   if (n <= 0) Error("build(GF2XModulus,GF2X): deg(f) <= 0");

   F.tracevec.SetLength(0);

   F.f = f;
   F.n = n;
   F.sn = f.xrep.length();

   long sb = F.sn;
   long posb = n - NTL_BITS_PER_LONG*(sb-1);

   F.posn = posb; 

   if (F.posn > 0) {
      F.size = F.sn;
      F.msk = (1UL << F.posn) - 1UL;
   }
   else {
      F.size = F.sn-1;
      F.msk = ~0UL;
   }

   SparsityCheck(f, F.k3, F.k2, F.k1);

   if (F.k3 != 0) {
      if (F.k2 == 0)
         F.method = GF2X_MOD_TRI;
      else
         F.method = GF2X_MOD_PENT;

      return;
   }


   GF2X f0;
   trunc(f0, f, n);
   long deg_f0 = deg(f0);

   if (F.sn > 1 && deg_f0 < NTL_BITS_PER_LONG 
       && deg_f0 >= NTL_BITS_PER_LONG/2) {
      if (F.size >= 6)
         F.method = GF2X_MOD_MUL;
      else
         F.method = GF2X_MOD_SPECIAL;
   }
   else if (F.sn > 1 && deg_f0 < NTL_BITS_PER_LONG/2) {
      if (F.size >= 4)
         F.method = GF2X_MOD_MUL;
      else
         F.method = GF2X_MOD_SPECIAL;
   }
   else if (F.size >= 8)
      F.method = GF2X_MOD_MUL;
   else 
      F.method = GF2X_MOD_PLAIN;
      

   if (F.method == GF2X_MOD_SPECIAL) {
      if (!F.stab_cnt) F.stab_cnt = NTL_NEW_OP long[NTL_BITS_PER_LONG];
      long *stab_cnt = F.stab_cnt;
      if (!stab_cnt) Error("out of memory");

      if (!F.stab1) F.stab1 = NTL_NEW_OP _ntl_ulong[2*NTL_BITS_PER_LONG];
      _ntl_ulong *stab1 = F.stab1;
      if (!stab1) Error("out of memory");

      stab1[posb<<1] = f.xrep[0];
      stab1[(posb<<1)+1] = 0;

      stab_cnt[posb] = -sb+1;

      for (i = 1; i < NTL_BITS_PER_LONG; i++) {
         long kk0 = ((_ntl_ulong)(posb+i-1))%NTL_BITS_PER_LONG;
         long kk1 = ((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG;

         stab1[kk1<<1] = stab1[kk0<<1] << 1;
         stab1[(kk1<<1)+1] = (stab1[(kk0<<1)+1] << 1) 
                          | (stab1[kk0<<1] >> (NTL_BITS_PER_LONG-1));

         if (kk1 < posb) 
            stab_cnt[kk1] = -sb;
         else
            stab_cnt[kk1] = -sb+1;
      }
   }
   else if (F.method == GF2X_MOD_PLAIN) {
      vec_GF2X& stab = F.stab;
      stab.SetLength(NTL_BITS_PER_LONG);


      if (!F.stab_ptr) F.stab_ptr = NTL_NEW_OP _ntl_ulong_ptr[NTL_BITS_PER_LONG];
      _ntl_ulong **stab_ptr = F.stab_ptr;
      if (!stab_ptr) Error("out of memory");

      if (!F.stab_cnt) F.stab_cnt = NTL_NEW_OP long[NTL_BITS_PER_LONG];
      long *stab_cnt = F.stab_cnt;
      if (!stab_cnt) Error("out of memory");
      
   
      stab[posb] = f;
      for (i = 1; i < NTL_BITS_PER_LONG; i++) 
         MulByX(stab[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG], 
                stab[((_ntl_ulong)(posb+i-1))%NTL_BITS_PER_LONG]);
   
   
      for (i = 0; i < NTL_BITS_PER_LONG; i++) {
         WordVector& st = stab[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG].xrep;
         long k = st.length();
         stab_ptr[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG] = &st[k-1];
         stab_cnt[((_ntl_ulong)(posb+i))%NTL_BITS_PER_LONG] = -k+1;
      }
   }
   else if (F.method == GF2X_MOD_MUL) {
      GF2X P1, P2;

      CopyReverse(P1, f, n);
      InvTrunc(P2, P1, n-1);
      CopyReverse(P1, P2, n-2);
      trunc(F.h0, P1, n-2);
      F.f0 = f0;
   }
}

GF2XModulus::GF2XModulus()
{
   n = -1;
   method = GF2X_MOD_PLAIN;
   stab_ptr = 0;
   stab_cnt = 0;
   stab1 = 0;
}


// The following two routines are total spaghetti...unfortunately,
// cleaning them up would require too much re-coding in other
// places.

GF2XModulus::GF2XModulus(const GF2XModulus& F) :
   f(F.f), n(F.n), sn(F.sn), posn(F.posn), k3(F.k3), k2(F.k2), k1(F.k1),
   size(F.size), 
   msk(F.msk), method(F.method), stab(F.stab), h0(F.h0), f0(F.f0),
   stab_cnt(0), stab_ptr(0), stab1(0), tracevec(F.tracevec)
{
   if (method == GF2X_MOD_SPECIAL) {
      long i;
      stab1 = NTL_NEW_OP _ntl_ulong[2*NTL_BITS_PER_LONG];
      if (!stab1) Error("GF2XModulus: out of memory");
      for (i = 0; i < 2*NTL_BITS_PER_LONG; i++)
         stab1[i] = F.stab1[i];
      stab_cnt = NTL_NEW_OP long[NTL_BITS_PER_LONG];
      if (!stab_cnt) Error("GF2XModulus: out of memory");
      for (i = 0; i < NTL_BITS_PER_LONG; i++)
         stab_cnt[i] = F.stab_cnt[i];
   }
   else if (method == GF2X_MOD_PLAIN) {
      long i;

      if (F.stab_cnt) {
         stab_cnt = NTL_NEW_OP long[NTL_BITS_PER_LONG];
         if (!stab_cnt) Error("GF2XModulus: out of memory");
         for (i = 0; i < NTL_BITS_PER_LONG; i++)
            stab_cnt[i] = F.stab_cnt[i];
      }

      if (F.stab_ptr) {
         stab_ptr = NTL_NEW_OP _ntl_ulong_ptr[NTL_BITS_PER_LONG];
         if (!stab_ptr) Error("GF2XModulus: out of memory");
      
         for (i = 0; i < NTL_BITS_PER_LONG; i++) {
            WordVector& st = stab[((_ntl_ulong)(posn+i))%NTL_BITS_PER_LONG].xrep;
            long k = st.length();
            stab_ptr[((_ntl_ulong)(posn+i))%NTL_BITS_PER_LONG] = &st[k-1];
            stab_cnt[((_ntl_ulong)(posn+i))%NTL_BITS_PER_LONG] = -k+1;
         }
      }
   }
}

GF2XModulus& GF2XModulus::operator=(const GF2XModulus& F)
{
   if (this == &F) return *this;

   f=F.f; n=F.n; sn=F.sn; posn=F.posn; 
   k3=F.k3; k2=F.k2; k1=F.k1;
   size=F.size; 
   msk=F.msk; method=F.method; stab=F.stab; h0=F.h0; f0 = F.f0;
   tracevec=F.tracevec;

   if (method == GF2X_MOD_SPECIAL) {
      long i;
      if (!stab1) stab1 = NTL_NEW_OP _ntl_ulong[2*NTL_BITS_PER_LONG];
      if (!stab1) Error("GF2XModulus: out of memory");
      for (i = 0; i < 2*NTL_BITS_PER_LONG; i++)
         stab1[i] = F.stab1[i];
      if (!stab_cnt) stab_cnt = NTL_NEW_OP long[NTL_BITS_PER_LONG];
      if (!stab_cnt) Error("GF2XModulus: out of memory");
      for (i = 0; i < NTL_BITS_PER_LONG; i++)
         stab_cnt[i] = F.stab_cnt[i];
   }
   else if (method == GF2X_MOD_PLAIN) {
      long i;

      if (F.stab_cnt) {
         if (!stab_cnt) stab_cnt = NTL_NEW_OP long[NTL_BITS_PER_LONG];
         if (!stab_cnt) Error("GF2XModulus: out of memory");
         for (i = 0; i < NTL_BITS_PER_LONG; i++)
            stab_cnt[i] = F.stab_cnt[i];
      }

      if (F.stab_ptr) {
         if (!stab_ptr) stab_ptr = NTL_NEW_OP _ntl_ulong_ptr[NTL_BITS_PER_LONG];
         if (!stab_ptr) Error("GF2XModulus: out of memory");
      
         for (i = 0; i < NTL_BITS_PER_LONG; i++) {
            WordVector& st = stab[((_ntl_ulong)(posn+i))%NTL_BITS_PER_LONG].xrep;
            long k = st.length();
            stab_ptr[((_ntl_ulong)(posn+i))%NTL_BITS_PER_LONG] = &st[k-1];
            stab_cnt[((_ntl_ulong)(posn+i))%NTL_BITS_PER_LONG] = -k+1;
         }
      }
   }

   return *this;
}
   


GF2XModulus::~GF2XModulus() 
{ 
   delete [] stab_ptr; 
   delete [] stab_cnt; 
   delete [] stab1; 
}



GF2XModulus::GF2XModulus(const GF2X& ff)
{
   n = -1;
   method = GF2X_MOD_PLAIN;
   stab_ptr = 0;
   stab_cnt = 0;
   stab1 = 0;

   build(*this, ff);
}





void UseMulRem21(GF2X& r, const GF2X& a, const GF2XModulus& F)
{
   GF2XRegister(P1);
   GF2XRegister(P2);

   RightShift(P1, a, F.n);
   mul(P2, P1, F.h0);
   RightShift(P2, P2, F.n-2);
   add(P2, P2, P1);
   mul(P1, P2, F.f0);
   trunc(P1, P1, F.n);
   trunc(r, a, F.n);
   add(r, r, P1);
}

void UseMulDivRem21(GF2X& q, GF2X& r, const GF2X& a, const GF2XModulus& F)
{
   GF2XRegister(P1);
   GF2XRegister(P2);

   RightShift(P1, a, F.n);
   mul(P2, P1, F.h0);
   RightShift(P2, P2, F.n-2);
   add(P2, P2, P1);
   mul(P1, P2, F.f0);
   trunc(P1, P1, F.n);
   trunc(r, a, F.n);
   add(r, r, P1);
   q = P2;
}

void UseMulDiv21(GF2X& q, const GF2X& a, const GF2XModulus& F)
{
   GF2XRegister(P1);
   GF2XRegister(P2);

   RightShift(P1, a, F.n);
   mul(P2, P1, F.h0);
   RightShift(P2, P2, F.n-2);
   add(P2, P2, P1);
   q = P2;
}


void UseMulRemX1(GF2X& r, const GF2X& aa, const GF2XModulus& F)
{
   GF2XRegister(buf);
   GF2XRegister(tmp);
   GF2XRegister(a);

   clear(buf);
   a = aa;

   long n = F.n;
   long a_len = deg(a) + 1;

   while (a_len > 0) {
      long old_buf_len = deg(buf) + 1;
      long amt = min(2*n-1-old_buf_len, a_len);

      LeftShift(buf, buf, amt);
      RightShift(tmp, a, a_len-amt);
      add(buf, buf, tmp);
      trunc(a, a, a_len-amt);

      UseMulRem21(buf, buf, F);
      a_len -= amt;
   }

   r = buf;
}
   

void UseMulDivRemX1(GF2X& q, GF2X& r, const GF2X& aa, const GF2XModulus& F)
{
   GF2XRegister(buf);
   GF2XRegister(tmp);
   GF2XRegister(a);
   GF2XRegister(qq);
   GF2XRegister(qbuf);

   clear(buf);
   a = aa;
   clear(qq);

   long n = F.n;
   long a_len = deg(a) + 1;

   while (a_len > 0) {
      long old_buf_len = deg(buf) + 1;
      long amt = min(2*n-1-old_buf_len, a_len);

      LeftShift(buf, buf, amt);
      RightShift(tmp, a, a_len-amt);
      add(buf, buf, tmp);
      trunc(a, a, a_len-amt);

      UseMulDivRem21(qbuf, buf, buf, F);
      a_len -= amt;

      ShiftAdd(qq, qbuf, a_len);
   }

   r = buf;
   q = qq;
}


void UseMulDivX1(GF2X& q, const GF2X& aa, const GF2XModulus& F)
{
   GF2XRegister(buf);
   GF2XRegister(tmp);
   GF2XRegister(a);
   GF2XRegister(qq);
   GF2XRegister(qbuf);
   
   clear(buf);
   a = aa;
   clear(qq);

   long n = F.n;
   long a_len = deg(a) + 1;

   while (a_len > 0) {
      long old_buf_len = deg(buf) + 1;
      long amt = min(2*n-1-old_buf_len, a_len);

      LeftShift(buf, buf, amt);
      RightShift(tmp, a, a_len-amt);
      add(buf, buf, tmp);
      trunc(a, a, a_len-amt);

      UseMulDivRem21(qbuf, buf, buf, F);
      a_len -= amt;

      ShiftAdd(qq, qbuf, a_len);
   }

   q = qq;
}

static
void TrinomReduce(GF2X& x, const GF2X& a, long n, long k)
{
   long wn = n / NTL_BITS_PER_LONG;
   long bn = n - wn*NTL_BITS_PER_LONG;

   long wdiff = (n-k)/NTL_BITS_PER_LONG;
   long bdiff = (n-k) - wdiff*NTL_BITS_PER_LONG;

   long m = a.xrep.length()-1;

   if (wn > m) {
      x = a;
      return;
   }

   GF2XRegister(r);

   r = a;

   _ntl_ulong *p = r.xrep.elts();

   _ntl_ulong *pp;


   _ntl_ulong w;

   if (bn == 0) {
      if (bdiff == 0) {
         // bn == 0 && bdiff == 0

         while (m >= wn) {
            w = p[m];
            p[m-wdiff] ^= w;
            p[m-wn] ^= w;
            m--;
         }
      }
      else {
         // bn == 0 && bdiff != 0

         while (m >= wn) {
            w = p[m];
            pp = &p[m-wdiff];
            *pp ^= (w >> bdiff);
            *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bdiff));
            p[m-wn] ^= w;
            m--;
         }
      }
   }
   else {
      if (bdiff == 0) {
         // bn != 0 && bdiff == 0

         while (m > wn) {
            w = p[m];
            p[m-wdiff] ^= w;
            pp = &p[m-wn];
            *pp ^= (w >> bn);
            *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bn));
            m--;
         }

         w = (p[m] >> bn) << bn;;

         p[m-wdiff] ^= w;
         p[0] ^= (w >> bn);

         p[m] &= ((1UL<<bn)-1UL); 
      }
      else {
         // bn != 0 && bdiff != 0

         while (m > wn) {
            w = p[m];
            pp = &p[m-wdiff];
            *pp ^= (w >> bdiff);;
            *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bdiff));
            pp = &p[m-wn];
            *pp ^= (w >> bn);
            *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bn));
            m--;
         }

         w = (p[m] >> bn) << bn;;

         p[m-wdiff] ^= (w >> bdiff);
         if (m-wdiff-1 >= 0) p[m-wdiff-1] ^= (w << (NTL_BITS_PER_LONG-bdiff));
         p[0] ^= (w >> bn);
         p[m] &= ((1UL<<bn)-1UL); 
      }
   }

   if (bn == 0)
      wn--;

   while (wn >= 0 && p[wn] == 0)
      wn--;

   r.xrep.QuickSetLength(wn+1);

   x = r;
}

static
void PentReduce(GF2X& x, const GF2X& a, long n, long k3, long k2, long k1)
{
   long wn = n / NTL_BITS_PER_LONG;
   long bn = n - wn*NTL_BITS_PER_LONG;

   long m = a.xrep.length()-1;

   if (wn > m) {
      x = a;
      return;
   }

   long wdiff1 = (n-k1)/NTL_BITS_PER_LONG;
   long bdiff1 = (n-k1) - wdiff1*NTL_BITS_PER_LONG;

   long wdiff2 = (n-k2)/NTL_BITS_PER_LONG;
   long bdiff2 = (n-k2) - wdiff2*NTL_BITS_PER_LONG;

   long wdiff3 = (n-k3)/NTL_BITS_PER_LONG;
   long bdiff3 = (n-k3) - wdiff3*NTL_BITS_PER_LONG;

   static GF2X r;
   r = a;

   _ntl_ulong *p = r.xrep.elts();

   _ntl_ulong *pp;

   _ntl_ulong w;

   while (m > wn) {
      w = p[m];

      if (bn == 0) 
         p[m-wn] ^= w;
      else {
         pp = &p[m-wn];
         *pp ^= (w >> bn);
         *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bn));
      }

      if (bdiff1 == 0) 
         p[m-wdiff1] ^= w;
      else {
         pp = &p[m-wdiff1];
         *pp ^= (w >> bdiff1);
         *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bdiff1));
      }

      if (bdiff2 == 0) 
         p[m-wdiff2] ^= w;
      else {
         pp = &p[m-wdiff2];
         *pp ^= (w >> bdiff2);
         *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bdiff2));
      }

      if (bdiff3 == 0) 
         p[m-wdiff3] ^= w;
      else {
         pp = &p[m-wdiff3];
         *pp ^= (w >> bdiff3);
         *(pp-1) ^= (w << (NTL_BITS_PER_LONG-bdiff3));
      }

      m--;
   }

   w = (p[m] >> bn) << bn;

   p[0] ^= (w >> bn); 

   if (bdiff1 == 0)
      p[m-wdiff1] ^= w;
   else {
      p[m-wdiff1] ^= (w >> bdiff1);
      if (m-wdiff1-1 >= 0) p[m-wdiff1-1] ^= (w << (NTL_BITS_PER_LONG-bdiff1));
   }

   if (bdiff2 == 0)
      p[m-wdiff2] ^= w;
   else {
      p[m-wdiff2] ^= (w >> bdiff2);
      if (m-wdiff2-1 >= 0) p[m-wdiff2-1] ^= (w << (NTL_BITS_PER_LONG-bdiff2));
   }

   if (bdiff3 == 0)
      p[m-wdiff3] ^= w;
   else {
      p[m-wdiff3] ^= (w >> bdiff3);
      if (m-wdiff3-1 >= 0) p[m-wdiff3-1] ^= (w << (NTL_BITS_PER_LONG-bdiff3));
   }

   if (bn != 0)
      p[m] &= ((1UL<<bn)-1UL);

   
   if (bn == 0)
      wn--;

   while (wn >= 0 && p[wn] == 0)
      wn--;

   r.xrep.QuickSetLength(wn+1);

   x = r;
}




static
void RightShiftAdd(GF2X& c, const GF2X& a, long n)
{
   if (n < 0) {
      Error("RightShiftAdd: negative shamt");
   }

   if (n == 0) {
      add(c, c, a);
      return;
   }

   long sa = a.xrep.length();
   long wn = n/NTL_BITS_PER_LONG;
   long bn = n - wn*NTL_BITS_PER_LONG;

   if (wn >= sa) {
      return;
   }

   long sc = c.xrep.length();
   long i;

   if (sa-wn > sc)
      c.xrep.SetLength(sa-wn);

   _ntl_ulong *cp = c.xrep.elts();
   const _ntl_ulong *ap = a.xrep.elts();

   for (i = sc; i < sa-wn; i++)
      cp[i] = 0;


   if (bn == 0) {
      for (i = 0; i < sa-wn; i++)
         cp[i] ^= ap[i+wn];
   }
   else {
      for (i = 0; i < sa-wn-1; i++)
         cp[i] ^= (ap[i+wn] >> bn) | (ap[i+wn+1] << (NTL_BITS_PER_LONG - bn));

      cp[sa-wn-1] ^= ap[sa-1] >> bn;
   }

   c.normalize();
}


static
void TriDiv21(GF2X& q, const GF2X& a, long n, long k)
{
   GF2XRegister(P1);

   RightShift(P1, a, n);
   if (k != 1) 
      RightShiftAdd(P1, P1, n-k);

   q = P1;
}

static 
void TriDivRem21(GF2X& q, GF2X& r, const GF2X& a, long n, long k)
{
   GF2XRegister(Q);
   TriDiv21(Q, a, n, k);
   TrinomReduce(r, a, n, k);
   q = Q;
}


static
void PentDiv21(GF2X& q, const GF2X& a, long n, long k3, long k2, long k1)
{
   if (deg(a) < n) {
      clear(q);
      return;
   }

   GF2XRegister(P1);
   GF2XRegister(P2);

   RightShift(P1, a, n);
   
   RightShift(P2, P1, n-k3);
   RightShiftAdd(P2, P1, n-k2);
   if (k1 != 1) {
      RightShiftAdd(P2, P1, n-k1);
   }

   add(P2, P2, P1);

   q = P2;
}

static 
void PentDivRem21(GF2X& q, GF2X& r, const GF2X& a, long n, 
                  long k3, long k2, long k1)
{
   GF2XRegister(Q);
   PentDiv21(Q, a, n, k3, k2, k1);
   PentReduce(r, a, n, k3, k2, k1);
   q = Q;
}

static
void TriDivRemX1(GF2X& q, GF2X& r, const GF2X& aa, long n, long k)
{
   GF2XRegister(buf);
   GF2XRegister(tmp);
   GF2XRegister(a);
   GF2XRegister(qq);
   GF2XRegister(qbuf);

   clear(buf);
   a = aa;
   clear(qq);

   long a_len = deg(a) + 1;

   while (a_len > 0) {
      long old_buf_len = deg(buf) + 1;
      long amt = min(2*n-1-old_buf_len, a_len);

      LeftShift(buf, buf, amt);
      RightShift(tmp, a, a_len-amt);
      add(buf, buf, tmp);
      trunc(a, a, a_len-amt);

      TriDivRem21(qbuf, buf, buf, n, k);
      a_len -= amt;

      ShiftAdd(qq, qbuf, a_len);
   }

   r = buf;
   q = qq;
}


static
void TriDivX1(GF2X& q, const GF2X& aa, long n, long k)
{
   GF2XRegister(buf);
   GF2XRegister(tmp);
   GF2XRegister(a);
   GF2XRegister(qq);
   GF2XRegister(qbuf);
   
   clear(buf);
   a = aa;
   clear(qq);

   long a_len = deg(a) + 1;

   while (a_len > 0) {
      long old_buf_len = deg(buf) + 1;
      long amt = min(2*n-1-old_buf_len, a_len);

      LeftShift(buf, buf, amt);
      RightShift(tmp, a, a_len-amt);
      add(buf, buf, tmp);
      trunc(a, a, a_len-amt);

      TriDivRem21(qbuf, buf, buf, n, k);
      a_len -= amt;

      ShiftAdd(qq, qbuf, a_len);
   }

   q = qq;
}

static
void PentDivRemX1(GF2X& q, GF2X& r, const GF2X& aa, long n, 
                  long k3, long k2, long k1)
{
   GF2XRegister(buf);
   GF2XRegister(tmp);
   GF2XRegister(a);
   GF2XRegister(qq);
   GF2XRegister(qbuf);

   clear(buf);
   a = aa;
   clear(qq);

   long a_len = deg(a) + 1;

   while (a_len > 0) {
      long old_buf_len = deg(buf) + 1;
      long amt = min(2*n-1-old_buf_len, a_len);

      LeftShift(buf, buf, amt);
      RightShift(tmp, a, a_len-amt);
      add(buf, buf, tmp);
      trunc(a, a, a_len-amt);

      PentDivRem21(qbuf, buf, buf, n, k3, k2, k1);
      a_len -= amt;

      ShiftAdd(qq, qbuf, a_len);
   }

   r = buf;
   q = qq;
}


static
void PentDivX1(GF2X& q, const GF2X& aa, long n, long k3, long k2, long k1)
{
   GF2XRegister(buf);
   GF2XRegister(tmp);
   GF2XRegister(a);
   GF2XRegister(qq);
   GF2XRegister(qbuf);
   
   clear(buf);
   a = aa;
   clear(qq);

   long a_len = deg(a) + 1;

   while (a_len > 0) {
      long old_buf_len = deg(buf) + 1;
      long amt = min(2*n-1-old_buf_len, a_len);

      LeftShift(buf, buf, amt);
      RightShift(tmp, a, a_len-amt);
      add(buf, buf, tmp);
      trunc(a, a, a_len-amt);

      PentDivRem21(qbuf, buf, buf, n, k3, k2, k1);
      a_len -= amt;

      ShiftAdd(qq, qbuf, a_len);
   }

   q = qq;
}



void rem(GF2X& r, const GF2X& a, const GF2XModulus& F)
{
   long n = F.n;

   if (n < 0) Error("rem: uninitialized modulus");

   if (F.method == GF2X_MOD_TRI) {
      TrinomReduce(r, a, n, F.k3);
      return;
   }

   if (F.method == GF2X_MOD_PENT) {
      PentReduce(r, a, n, F.k3, F.k2, F.k1);
      return;
   }

   long da = deg(a);


   if (da < n) {
      r = a;
   }
   else if (F.method == GF2X_MOD_MUL) {
      if (da <= 2*(n-1)) 
         UseMulRem21(r, a, F);
      else
         UseMulRemX1(r, a, F);
   }
   else if (F.method == GF2X_MOD_SPECIAL) {
      long sa = a.xrep.length();
      long posa = da - NTL_BITS_PER_LONG*(sa-1);
   
      _ntl_ulong *ap;
      if (&r == &a)
         ap = r.xrep.elts();
      else {
         GF2X_rembuf = a.xrep;
         ap = GF2X_rembuf.elts();
      }
   
      _ntl_ulong *atop = &ap[sa-1];
      _ntl_ulong *stab_top;
   
      long i;
   
      while (1) {
         if (atop[0] & (1UL << posa)) {
            stab_top = &F.stab1[posa << 1];
            i = F.stab_cnt[posa];
            atop[i] ^= stab_top[0];
            atop[i+1] ^= stab_top[1];
         }

         da--;
         if (da < n) break;

         posa--;
         if (posa < 0) {
            posa = NTL_BITS_PER_LONG-1;
            atop--;
         }
      }
   
      long sn = F.size;
      r.xrep.SetLength(sn);
      if (&r != &a) {
         _ntl_ulong *rp = r.xrep.elts();
         for (i = 0; i < sn; i++)
            rp[i] = ap[i];
      }
      r.xrep[sn-1] &= F.msk;
      r.normalize();
   }
   else {
      long sa = a.xrep.length();
      long posa = da - NTL_BITS_PER_LONG*(sa-1);
   
   
      _ntl_ulong *ap;
      if (&r == &a)
         ap = r.xrep.elts();
      else {
         GF2X_rembuf = a.xrep;
         ap = GF2X_rembuf.elts();
      }
   
      _ntl_ulong *atop = &ap[sa-1];
      _ntl_ulong *stab_top;
   
      long i;
   
      while (1) {
         if (atop[0] & (1UL << posa)) {
            stab_top = F.stab_ptr[posa];
            for (i = F.stab_cnt[posa]; i <= 0; i++)
               atop[i] ^= stab_top[i];
         }
   
         da--;
         if (da < n) break;

         posa--;
         if (posa < 0) {
            posa = NTL_BITS_PER_LONG-1;
            atop--;
         }
      }
   
      long sn = F.size;
      r.xrep.SetLength(sn);
      if (&r != &a) {
         _ntl_ulong *rp = r.xrep.elts();
         for (i = 0; i < sn; i++)
            rp[i] = ap[i];
      }
      r.normalize();
   }
}

void DivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2XModulus& F)
{
   long da = deg(a);
   long n = F.n;

   if (n < 0) Error("DivRem: uninitialized modulus");

   if (da < n) {
      r = a;
      clear(q);
   }
   else if (F.method == GF2X_MOD_TRI) {
      if (da <= 2*(n-1)) 
         TriDivRem21(q, r, a, F.n, F.k3);
      else
         TriDivRemX1(q, r, a, F.n, F.k3);
   }
   else if (F.method == GF2X_MOD_PENT) {
      if (da <= 2*(n-1)) 
         PentDivRem21(q, r, a, F.n, F.k3, F.k2, F.k1);
      else
         PentDivRemX1(q, r, a, F.n, F.k3, F.k2, F.k1);
   }
   else if (F.method == GF2X_MOD_MUL) {
      if (da <= 2*(n-1)) 
         UseMulDivRem21(q, r, a, F);
      else
         UseMulDivRemX1(q, r, a, F);
   }
   else if (F.method == GF2X_MOD_SPECIAL) {
      long sa = a.xrep.length();
      long posa = da - NTL_BITS_PER_LONG*(sa-1);
   
      long dq = da - n;
      long sq = dq/NTL_BITS_PER_LONG + 1;
      long posq = dq - NTL_BITS_PER_LONG*(sq-1);
   
      _ntl_ulong *ap;
      if (&r == &a)
         ap = r.xrep.elts();
      else {
         GF2X_rembuf = a.xrep;
         ap = GF2X_rembuf.elts();
      }
   
      _ntl_ulong *atop = &ap[sa-1];
      _ntl_ulong *stab_top;

      long i;

      q.xrep.SetLength(sq);
      _ntl_ulong *qp = q.xrep.elts();
      for (i = 0; i < sq; i++)
         qp[i] = 0;

      _ntl_ulong *qtop = &qp[sq-1];

   
      while (1) {
         if (atop[0] & (1UL << posa)) {
            qtop[0] |= (1UL << posq);
            stab_top = &F.stab1[posa << 1];
            i = F.stab_cnt[posa];
            atop[i] ^= stab_top[0];
            atop[i+1] ^= stab_top[1];
         }
   
         da--;
         if (da < n) break;

         posa--;
         if (posa < 0) {
            posa = NTL_BITS_PER_LONG-1;
            atop--;
         }

         posq--;
         if (posq < 0) {
            posq = NTL_BITS_PER_LONG-1;
            qtop--;
         }
      }
   
      long sn = F.size;
      r.xrep.SetLength(sn);
      if (&r != &a) {
         _ntl_ulong *rp = r.xrep.elts();
         for (i = 0; i < sn; i++)
            rp[i] = ap[i];
      }
      r.xrep[sn-1] &= F.msk;
      r.normalize();
   }
   else {
      long sa = a.xrep.length();
      long posa = da - NTL_BITS_PER_LONG*(sa-1);
   
      long dq = da - n;
      long sq = dq/NTL_BITS_PER_LONG + 1;
      long posq = dq - NTL_BITS_PER_LONG*(sq-1);
   
      _ntl_ulong *ap;
      if (&r == &a)
         ap = r.xrep.elts();
      else {
         GF2X_rembuf = a.xrep;
         ap = GF2X_rembuf.elts();
      }
   
      _ntl_ulong *atop = &ap[sa-1];
      _ntl_ulong *stab_top;
   
      long i;

      q.xrep.SetLength(sq);
      _ntl_ulong *qp = q.xrep.elts();
      for (i = 0; i < sq; i++)
         qp[i] = 0;

      _ntl_ulong *qtop = &qp[sq-1];
   
      while (1) {
         if (atop[0] & (1UL << posa)) {
            qtop[0] |= (1UL << posq);
            stab_top = F.stab_ptr[posa];
            for (i = F.stab_cnt[posa]; i <= 0; i++)
               atop[i] ^= stab_top[i];
         }
   
         da--;
         if (da < n) break;

         posa--;
         if (posa < 0) {
            posa = NTL_BITS_PER_LONG-1;
            atop--;
         }

         posq--;
         if (posq < 0) {
            posq = NTL_BITS_PER_LONG-1;
            qtop--;
         }
      }
   
      long sn = F.size;
      r.xrep.SetLength(sn);
      if (&r != &a) {
         _ntl_ulong *rp = r.xrep.elts();
         for (i = 0; i < sn; i++)
            rp[i] = ap[i];
      }
      r.normalize();
   }
}



void div(GF2X& q, const GF2X& a, const GF2XModulus& F)
{
   long da = deg(a);
   long n = F.n;

   if (n < 0) Error("div: uninitialized modulus");


   if (da < n) {
      clear(q);
   }
   else if (F.method == GF2X_MOD_TRI) {
      if (da <= 2*(n-1)) 
         TriDiv21(q, a, F.n, F.k3);
      else
         TriDivX1(q, a, F.n, F.k3);
   }
   else if (F.method == GF2X_MOD_PENT) {
      if (da <= 2*(n-1)) 
         PentDiv21(q, a, F.n, F.k3, F.k2, F.k1);
      else
         PentDivX1(q, a, F.n, F.k3, F.k2, F.k1);
   }
   else if (F.method == GF2X_MOD_MUL) {
      if (da <= 2*(n-1)) 
         UseMulDiv21(q, a, F);
      else
         UseMulDivX1(q, a, F);
   }
   else if (F.method == GF2X_MOD_SPECIAL) {
      long sa = a.xrep.length();
      long posa = da - NTL_BITS_PER_LONG*(sa-1);
   
      long dq = da - n;
      long sq = dq/NTL_BITS_PER_LONG + 1;
      long posq = dq - NTL_BITS_PER_LONG*(sq-1);
   
      _ntl_ulong *ap;
      GF2X_rembuf = a.xrep;
      ap = GF2X_rembuf.elts();

   
      _ntl_ulong *atop = &ap[sa-1];
      _ntl_ulong *stab_top;

      long i;

      q.xrep.SetLength(sq);
      _ntl_ulong *qp = q.xrep.elts();
      for (i = 0; i < sq; i++)
         qp[i] = 0;

      _ntl_ulong *qtop = &qp[sq-1];
   
      while (1) {
         if (atop[0] & (1UL << posa)) {
            qtop[0] |= (1UL << posq);
            stab_top = &F.stab1[posa << 1];
            i = F.stab_cnt[posa];
            atop[i] ^= stab_top[0];
            atop[i+1] ^= stab_top[1];
         }
   
         da--;
         if (da < n) break;

         posa--;
         if (posa < 0) {
            posa = NTL_BITS_PER_LONG-1;
            atop--;
         }

         posq--;
         if (posq < 0) {
            posq = NTL_BITS_PER_LONG-1;
            qtop--;
         }
      }
   }
   else {
      long sa = a.xrep.length();
      long posa = da - NTL_BITS_PER_LONG*(sa-1);
   
      long dq = da - n;
      long sq = dq/NTL_BITS_PER_LONG + 1;
      long posq = dq - NTL_BITS_PER_LONG*(sq-1);
   
      _ntl_ulong *ap;
      GF2X_rembuf = a.xrep;
      ap = GF2X_rembuf.elts();
   
      _ntl_ulong *atop = &ap[sa-1];
      _ntl_ulong *stab_top;
   
      long i;

      q.xrep.SetLength(sq);
      _ntl_ulong *qp = q.xrep.elts();
      for (i = 0; i < sq; i++)
         qp[i] = 0;

      _ntl_ulong *qtop = &qp[sq-1];
   
      while (1) {
         if (atop[0] & (1UL << posa)) {
            qtop[0] |= (1UL << posq);
            stab_top = F.stab_ptr[posa];
            for (i = F.stab_cnt[posa]; i <= 0; i++)
               atop[i] ^= stab_top[i];
         }
   
         da--;
         if (da < n) break;

         posa--;
         if (posa < 0) {
            posa = NTL_BITS_PER_LONG-1;
            atop--;
         }

         posq--;
         if (posq < 0) {
            posq = NTL_BITS_PER_LONG-1;
            qtop--;
         }
      }
   }
}


void MulMod(GF2X& c, const GF2X& a, const GF2X& b, const GF2XModulus& F)
{
   if (F.n < 0) Error("MulMod: uninitialized modulus");

   GF2XRegister(t);
   mul(t, a, b);
   rem(c, t, F);
}


void SqrMod(GF2X& c, const GF2X& a, const GF2XModulus& F)
{
   if (F.n < 0) Error("SqrMod: uninitialized modulus");

   GF2XRegister(t);
   sqr(t, a);
   rem(c, t, F);
}


// we need these two versions to prevent a GF2XModulus
// from being constructed.


void MulMod(GF2X& c, const GF2X& a, const GF2X& b, const GF2X& f)
{
   GF2XRegister(t);
   mul(t, a, b);
   rem(c, t, f);
}

void SqrMod(GF2X& c, const GF2X& a, const GF2X& f)
{
   GF2XRegister(t);
   sqr(t, a);
   rem(c, t, f);
}


static
long OptWinSize(long n)
// finds k that minimizes n/(k+1) + 2^{k-1}

{
   long k;
   double v, v_new;


   v = n/2.0 + 1.0;
   k = 1;

   for (;;) {
      v_new = n/(double(k+2)) + double(1L << k);
      if (v_new >= v) break;
      v = v_new;
      k++;
   }

   return k;
}
      


void PowerMod(GF2X& h, const GF2X& g, const ZZ& e, const GF2XModulus& F)
// h = g^e mod f using "sliding window" algorithm
{
   if (deg(g) >= F.n) Error("PowerMod: bad args");

   if (e == 0) {
      set(h);
      return;
   }

   if (e == 1) {
      h = g;
      return;
   }

   if (e == -1) {
      InvMod(h, g, F);
      return;
   }

   if (e == 2) {
      SqrMod(h, g, F);
      return;
   }

   if (e == -2) {
      SqrMod(h, g, F);
      InvMod(h, h, F);
      return;
   }


   long n = NumBits(e);

   GF2X res;
   res.SetMaxLength(F.n);
   set(res);

   long i;

   if (n < 16) {
      // plain square-and-multiply algorithm

      for (i = n - 1; i >= 0; i--) {
         SqrMod(res, res, F);
         if (bit(e, i))
            MulMod(res, res, g, F);
      }

      if (e < 0) InvMod(res, res, F);

      h = res;
      return;
   }

   long k = OptWinSize(n);

   k = min(k, 9);

   vec_GF2X v;

   v.SetLength(1L << (k-1));

   v[0] = g;
 
   if (k > 1) {
      GF2X t;
      SqrMod(t, g, F);

      for (i = 1; i < (1L << (k-1)); i++)
         MulMod(v[i], v[i-1], t, F);
   }


   long val;
   long cnt;
   long m;

   val = 0;
   for (i = n-1; i >= 0; i--) {
      val = (val << 1) | bit(e, i); 
      if (val == 0)
         SqrMod(res, res, F);
      else if (val >= (1L << (k-1)) || i == 0) {
         cnt = 0;
         while ((val & 1) == 0) {
            val = val >> 1;
            cnt++;
         }

         m = val;
         while (m > 0) {
            SqrMod(res, res, F);
            m = m >> 1;
         }

         MulMod(res, res, v[val >> 1], F);

         while (cnt > 0) {
            SqrMod(res, res, F);
            cnt--;
         }

         val = 0;
      }
   }

   if (e < 0) InvMod(res, res, F);

   h = res;
}

   


void PowerXMod(GF2X& hh, const ZZ& e, const GF2XModulus& F)
{
   if (F.n < 0) Error("PowerXMod: uninitialized modulus");

   if (IsZero(e)) {
      set(hh);
      return;
   }

   long n = NumBits(e);
   long i;

   GF2X h;

   h.SetMaxLength(F.n+1);
   set(h);

   for (i = n - 1; i >= 0; i--) {
      SqrMod(h, h, F);
      if (bit(e, i)) {
         MulByX(h, h);
         if (coeff(h, F.n) != 0)
            add(h, h, F.f);
      }
   }

   if (e < 0) InvMod(h, h, F);

   hh = h;
}


      


void UseMulRem(GF2X& r, const GF2X& a, const GF2X& b)
{
   GF2XRegister(P1);
   GF2XRegister(P2);

   long da = deg(a);
   long db = deg(b);

   CopyReverse(P1, b, db);
   InvTrunc(P2, P1, da-db+1);
   CopyReverse(P1, P2, da-db);

   RightShift(P2, a, db);
   mul(P2, P1, P2);
   RightShift(P2, P2, da-db);
   mul(P1, P2, b);
   add(P1, P1, a);
   
   r = P1;
}

void UseMulDivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2X& b)
{
   GF2XRegister(P1);
   GF2XRegister(P2);

   long da = deg(a);
   long db = deg(b);

   CopyReverse(P1, b, db);
   InvTrunc(P2, P1, da-db+1);
   CopyReverse(P1, P2, da-db);

   RightShift(P2, a, db);
   mul(P2, P1, P2);
   RightShift(P2, P2, da-db);
   mul(P1, P2, b);
   add(P1, P1, a);
   
   r = P1;
   q = P2;
}

void UseMulDiv(GF2X& q, const GF2X& a, const GF2X& b)
{
   GF2XRegister(P1);
   GF2XRegister(P2);

   long da = deg(a);
   long db = deg(b);

   CopyReverse(P1, b, db);
   InvTrunc(P2, P1, da-db+1);
   CopyReverse(P1, P2, da-db);

   RightShift(P2, a, db);
   mul(P2, P1, P2);
   RightShift(P2, P2, da-db);
   
   q = P2;
}


const long GF2X_DIV_CROSS = 100; 

void DivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2X& b)
{
   long sa = a.xrep.length();
   long sb = b.xrep.length();

   if (sb < GF2X_DIV_CROSS || sa-sb < GF2X_DIV_CROSS)
      PlainDivRem(q, r, a, b);
   else if (sa < 4*sb)
      UseMulDivRem(q, r, a, b);
   else {
      GF2XModulus B;
      build(B, b);
      DivRem(q, r, a, B);
   }
}

void div(GF2X& q, const GF2X& a, const GF2X& b)
{
   long sa = a.xrep.length();
   long sb = b.xrep.length();

   if (sb < GF2X_DIV_CROSS || sa-sb < GF2X_DIV_CROSS)
      PlainDiv(q, a, b);
   else if (sa < 4*sb)
      UseMulDiv(q, a, b);
   else {
      GF2XModulus B;
      build(B, b);
      div(q, a, B);
   }
}

void rem(GF2X& r, const GF2X& a, const GF2X& b)
{
   long sa = a.xrep.length();
   long sb = b.xrep.length();

   if (sb < GF2X_DIV_CROSS || sa-sb < GF2X_DIV_CROSS)
      PlainRem(r, a, b);
   else if (sa < 4*sb)
      UseMulRem(r, a, b);
   else {
      GF2XModulus B;
      build(B, b);
      rem(r, a, B);
   }
}


static inline 
void swap(_ntl_ulong_ptr& a, _ntl_ulong_ptr& b)  
{  _ntl_ulong_ptr t;  t = a; a = b; b = t; }




static
void BaseGCD(GF2X& d, const GF2X& a_in, const GF2X& b_in)
{
   GF2XRegister(a);
   GF2XRegister(b);
   
   if (IsZero(a_in)) {
      d = b_in;
      return;
   }

   if (IsZero(b_in)) {
      d = a_in;
      return;
   }
      
   a.xrep.SetMaxLength(a_in.xrep.length()+1);
   b.xrep.SetMaxLength(b_in.xrep.length()+1);

   a = a_in;
   b = b_in;

   _ntl_ulong *ap = a.xrep.elts();
   _ntl_ulong *bp = b.xrep.elts();

   long da = deg(a);
   long wa = da/NTL_BITS_PER_LONG;
   long ba = da - wa*NTL_BITS_PER_LONG;

   long db = deg(b);
   long wb = db/NTL_BITS_PER_LONG;
   long bb = db - wb*NTL_BITS_PER_LONG;

   long parity = 0;

   for (;;) {
      if (da < db) {
         swap(ap, bp);
         swap(da, db);
         swap(wa, wb);
         swap(ba, bb);
         parity = 1 - parity;
      }

      // da >= db

      if (db == -1) break;

      ShiftAdd(ap, bp, wb+1, da-db);

      _ntl_ulong msk = 1UL << ba;
      _ntl_ulong aa = ap[wa];

      while ((aa & msk) == 0) {
         da--;
         msk = msk >> 1;
         ba--;
         if (!msk) {
            wa--;
            ba = NTL_BITS_PER_LONG-1;
            msk = 1UL << (NTL_BITS_PER_LONG-1);
            if (wa < 0) break;
            aa = ap[wa];
         }
      }
   }

   a.normalize();
   b.normalize();

   if (!parity) {
      d = a;
   }
   else {
      d = b;
   }
}


void GCD(GF2X& d, const GF2X& a, const GF2X& b)
{
   long sa = a.xrep.length();
   long sb = b.xrep.length();

   if (sb >= 10 && 2*sa > 3*sb) {
      GF2XRegister(r);

      rem(r, a, b);
      BaseGCD(d, b, r);
   }
   else if (sa >= 10 && 2*sb > 3*sa) {
      GF2XRegister(r);

      rem(r, b, a);
      BaseGCD(d, a, r);
   }
   else {
      BaseGCD(d, a, b);
   }
}





#define XX_STEP(ap,da,wa,ba,rp,sr,bp,db,wb,bb,sp,ss)  \
      long delta = da-db;  \
  \
      if (delta == 0) {  \
         long i;  \
         for (i = wb; i >= 0; i--) ap[i] ^= bp[i];  \
         for (i = ss-1; i >= 0; i--) rp[i] ^= sp[i];  \
         if (ss > sr) sr = ss; \
      }  \
      else if (delta == 1) {  \
         long i; \
         _ntl_ulong tt, tt1;  \
  \
         tt = bp[wb] >> (NTL_BITS_PER_LONG-1);  \
         if (tt) ap[wb+1] ^= tt;  \
         tt = bp[wb];  \
         for (i = wb; i >= 1; i--)  \
            tt1 = bp[i-1], ap[i] ^= (tt << 1) | (tt1 >> (NTL_BITS_PER_LONG-1)),  \
            tt = tt1; \
         ap[0] ^= tt << 1;  \
  \
         if (ss > 0) {  \
            long t = ss; \
            tt = sp[ss-1] >> (NTL_BITS_PER_LONG-1);  \
            if (tt) rp[ss] ^= tt, t++;  \
            tt = sp[ss-1]; \
            for (i = ss-1; i >= 1; i--)  \
               tt1=sp[i-1],  \
               rp[i] ^= (tt << 1) | (tt1 >> (NTL_BITS_PER_LONG-1)),  \
               tt = tt1; \
            rp[0] ^= tt << 1;  \
            if (t > sr) sr = t; \
         }  \
      }  \
      else if (delta < NTL_BITS_PER_LONG) {  \
         long i; \
         _ntl_ulong tt, tt1;  \
         long rdelta = NTL_BITS_PER_LONG-delta; \
  \
         tt = bp[wb] >> rdelta;  \
         if (tt) ap[wb+1] ^= tt;  \
         tt=bp[wb]; \
         for (i = wb; i >= 1; i--)  \
            tt1=bp[i-1], ap[i] ^= (tt << delta) | (tt1 >> rdelta),  \
            tt=tt1; \
         ap[0] ^= tt << delta;  \
  \
         if (ss > 0) {  \
            long t = ss; \
            tt = sp[ss-1] >> rdelta;  \
            if (tt) rp[ss] ^= tt, t++;  \
            tt=sp[ss-1]; \
            for (i = ss-1; i >= 1; i--)  \
               tt1=sp[i-1], rp[i] ^= (tt << delta) | (tt1 >> rdelta),  \
               tt=tt1; \
            rp[0] ^= tt << delta;  \
            if (t > sr) sr = t; \
         }  \
      }  \
      else {  \
         ShiftAdd(ap, bp, wb+1, da-db);  \
         ShiftAdd(rp, sp, ss, da-db);  \
         long t = ss + (da-db+NTL_BITS_PER_LONG-1)/NTL_BITS_PER_LONG;  \
         if (t > sr) {  \
            while (t > 0 && rp[t-1] == 0) t--;   \
            sr = t;  \
         }  \
      } \
  \
      _ntl_ulong msk = 1UL << ba;  \
      _ntl_ulong aa = ap[wa];  \
  \
      while ((aa & msk) == 0) {  \
         da--;  \
         msk = msk >> 1;  \
         ba--;  \
         if (!msk) {  \
            wa--;  \
            ba = NTL_BITS_PER_LONG-1;  \
            msk = 1UL << (NTL_BITS_PER_LONG-1);  \
            if (wa < 0) break;  \
            aa = ap[wa];  \
         }  \
      }  \




static
void XXGCD(GF2X& d, GF2X& r_out, const GF2X& a_in, const GF2X& b_in)
{
   GF2XRegister(a);
   GF2XRegister(b);
   GF2XRegister(r);
   GF2XRegister(s);

   if (IsZero(b_in)) {
      d = a_in;
      set(r_out);
      return;
   }

   if (IsZero(a_in)) {
      d = b_in;
      clear(r_out);
      return;
   }
      
   a.xrep.SetMaxLength(a_in.xrep.length()+1);
   b.xrep.SetMaxLength(b_in.xrep.length()+1);

   long max_sz = max(a_in.xrep.length(), b_in.xrep.length());
   r.xrep.SetLength(max_sz+1);
   s.xrep.SetLength(max_sz+1);

   _ntl_ulong *rp = r.xrep.elts();
   _ntl_ulong *sp = s.xrep.elts();

   long i;
   for (i = 0; i <= max_sz; i++) {
      rp[i] = sp[i] = 0;
   }

   rp[0] = 1;

   long sr = 1;
   long ss = 0;

   a = a_in;
   b = b_in;

   _ntl_ulong *ap = a.xrep.elts();
   _ntl_ulong *bp = b.xrep.elts();

   long da = deg(a);
   long wa = da/NTL_BITS_PER_LONG;
   long ba = da - wa*NTL_BITS_PER_LONG;

   long db = deg(b);
   long wb = db/NTL_BITS_PER_LONG;
   long bb = db - wb*NTL_BITS_PER_LONG;

   long parity = 0;


   for (;;) {
      if (da == -1 || db == -1) break;

      if (da < db || (da == db && parity)) {
         if (da < db && !parity) parity = 1;
         XX_STEP(bp,db,wb,bb,sp,ss,ap,da,wa,ba,rp,sr)

      }
      else {
         parity = 0;
         XX_STEP(ap,da,wa,ba,rp,sr,bp,db,wb,bb,sp,ss)
      }
   }

   a.normalize();
   b.normalize();
   r.normalize();
   s.normalize();

   if (db == -1) {
      d = a;
      r_out = r;
   }
   else {
      d = b;
      r_out = s;
   }
}



static
void BaseXGCD(GF2X& d, GF2X& s, GF2X& t, const GF2X& a, const GF2X& b)
{
   if (IsZero(b)) {
      d = a;
      set(s);
      clear(t);
   }
   else {
      GF2XRegister(t1);
      GF2XRegister(b1);

      b1 = b;
      XXGCD(d, s, a, b);
      mul(t1, a, s);
      add(t1, t1, d);
      div(t, t1, b1);
   }
}




void XGCD(GF2X& d, GF2X& s, GF2X& t, const GF2X& a, const GF2X& b)
{
   long sa = a.xrep.length();
   long sb = b.xrep.length();


   if (sb >= 10 && 2*sa > 3*sb) {
      GF2XRegister(r);
      GF2XRegister(q);
      GF2XRegister(s1);
      GF2XRegister(t1);


      DivRem(q, r, a, b);
      BaseXGCD(d, s1, t1, b, r);

      
      mul(r, t1, q);
      add(r, r, s1);  // r = s1 - t1*q, but sign doesn't matter

      s = t1;
      t = r;   
   }
   else if (sa >= 10 && 2*sb > 3*sa) {
      GF2XRegister(r);
      GF2XRegister(q);
      GF2XRegister(s1);
      GF2XRegister(t1);


      DivRem(q, r, b, a);
      BaseXGCD(d, s1, t1, a, r);

      
      mul(r, t1, q);
      add(r, r, s1);  // r = s1 - t1*q, but sign doesn't matter

      t = t1;
      s = r;  
   }
   else {
      BaseXGCD(d, s, t, a, b);
   }

}




static
void BaseInvMod(GF2X& d, GF2X& s, const GF2X& a, const GF2X& f)
{
   if (deg(a) >= deg(f) || deg(f) == 0) Error("InvMod: bad args");

   long sa = a.xrep.length();
   long sf = f.xrep.length();

   if (sa >= 10 && 2*sf > 3*sa) {
      GF2XRegister(t);

      XGCD(d, s, t, a, f);
   }
   else {
      XXGCD(d, s, a, f);
   }

}



void InvMod(GF2X& c, const GF2X& a, const GF2X& f)
{ 
   GF2XRegister(d);
   GF2XRegister(s);
   BaseInvMod(d, s, a, f);

   if (!IsOne(d)) Error("InvMod: inverse undefined");

   c = s;
}



long InvModStatus(GF2X& c, const GF2X& a, const GF2X& f)
{ 
   GF2XRegister(d);
   GF2XRegister(s);
   BaseInvMod(d, s, a, f);

   if (!IsOne(d)) {
      c = d;
      return 1;
   }

   c = s;
   return 0;
}



   
void diff(GF2X& c, const GF2X& a)
{
   RightShift(c, a, 1);
   
   // clear odd coeffs

   long dc = deg(c);
   long i;
   for (i = 1; i <= dc; i += 2)
      SetCoeff(c, i, 0);
}

void conv(GF2X& c, long a)
{
   if (a & 1)
      set(c);
   else
      clear(c);
}

void conv(GF2X& c, GF2 a)
{
   if (a == 1)
      set(c);
   else
      clear(c);
}

void conv(GF2X& x, const vec_GF2& a)
{
   x.xrep = a.rep;
   x.normalize();
}

void conv(vec_GF2& x, const GF2X& a)
{
   VectorCopy(x, a, deg(a)+1);
}

void VectorCopy(vec_GF2& x, const GF2X& a, long n)
{
   if (n < 0) Error("VectorCopy: negative length"); 

   if (NTL_OVERFLOW(n, 1, 0))
      Error("overflow in VectorCopy");

   long wa = a.xrep.length();
   long wx = (n + NTL_BITS_PER_LONG - 1)/NTL_BITS_PER_LONG;

   long wmin = min(wa, wx);

   x.SetLength(n);

   const _ntl_ulong *ap = a.xrep.elts();
   _ntl_ulong *xp = x.rep.elts();

   long i;
   for (i = 0; i < wmin; i++)
      xp[i] = ap[i];

   if (wa < wx) {
      for (i = wa; i < wx; i++)
         xp[i] = 0;
   }
   else {
      long p = n % NTL_BITS_PER_LONG;
      if (p != 0)
         xp[wx-1] &= (1UL << p) - 1UL;
   }
}


void add(GF2X& c, const GF2X& a, long b)
{
   c = a;
   if (b & 1) {
      long n = c.xrep.length();
      if (n == 0) 
         set(c);
      else {
         c.xrep[0] ^= 1;
         if (n == 1 && !c.xrep[0]) c.xrep.SetLength(0);
      }
   }
}

void add(GF2X& c, const GF2X& a, GF2 b)
{
   add(c, a, rep(b));
}


void MulTrunc(GF2X& c, const GF2X& a, const GF2X& b, long n)
{
   GF2XRegister(t);

   mul(t, a, b);
   trunc(c, t, n);
}

void SqrTrunc(GF2X& c, const GF2X& a, long n)
{
   GF2XRegister(t);

   sqr(t, a);
   trunc(c, t, n);
}


long divide(GF2X& q, const GF2X& a, const GF2X& b)
{
   if (IsZero(b)) {
      if (IsZero(a)) {
         clear(q);
         return 1;
      }
      else
         return 0;
   }

   GF2XRegister(lq);
   GF2XRegister(r);

   DivRem(lq, r, a, b);
   if (!IsZero(r)) return 0;
   q = lq;
   return 1;
}

long divide(const GF2X& a, const GF2X& b)
{
   if (IsZero(b)) return IsZero(a);
   GF2XRegister(r);
   rem(r, a, b);
   if (!IsZero(r)) return 0;
   return 1;
}



/*** modular composition routines and data structures ***/


void InnerProduct(GF2X& x, const GF2X& v, long dv, long low, long high, 
                   const vec_GF2X& H, long n, WordVector& t)
{
   long i, j;

   _ntl_ulong *tp = t.elts();

   for (i = 0; i < n; i++)
      tp[i] = 0;


   long w_low = low/NTL_BITS_PER_LONG;
   long b_low = low - w_low*NTL_BITS_PER_LONG;

   
   const _ntl_ulong *vp = &v.xrep[w_low];
   _ntl_ulong msk = 1UL << b_low;
   _ntl_ulong vv = *vp;

   high = min(high, dv);

   i = low;
   for (;;) {
      if (vv & msk) {
         const WordVector& h = H[i-low].xrep;
         long m = h.length();
         const _ntl_ulong *hp = h.elts();
         for (j = 0; j < m; j++)
            tp[j] ^= hp[j];
      }

      i++;
      if (i > high) break;

      msk = msk << 1;
      if (!msk) {
         msk = 1UL;
         vp++;
         vv = *vp;
      }
   }

   x.xrep = t;
   x.normalize();
}


void CompMod(GF2X& x, const GF2X& g, const GF2XArgument& A, const GF2XModulus& F)
{
   long dg = deg(g);
   if (dg <= 0) {
      x = g;
      return;
   }

   GF2X s, t;
   WordVector scratch(INIT_SIZE, F.size);

   long m = A.H.length() - 1;
   long l = (((dg+1)+m-1)/m) - 1;

   InnerProduct(t, g, dg, l*m, l*m + m - 1, A.H, F.size, scratch);
   for (long i = l-1; i >= 0; i--) {
      InnerProduct(s, g, dg, i*m, i*m + m - 1, A.H, F.size, scratch);
      MulMod(t, t, A.H[m], F);
      add(t, t, s);
   }

   x = t;
}

void build(GF2XArgument& A, const GF2X& h, const GF2XModulus& F, long m)
{
   if (m <= 0 || deg(h) >= F.n) Error("build GF2XArgument: bad args");

   if (m > F.n) m = F.n;

   long i;

   A.H.SetLength(m+1);

   set(A.H[0]);
   A.H[1] = h;
   for (i = 2; i <= m; i++) 
      MulMod(A.H[i], A.H[i-1], h, F);
}


void CompMod(GF2X& x, const GF2X& g, const GF2X& h, const GF2XModulus& F)
   // x = g(h) mod f
{
   long m = SqrRoot(deg(g)+1);

   if (m == 0) {
      clear(x);
      return;
   }

   GF2XArgument A;

   build(A, h, F, m);

   CompMod(x, g, A, F);
}




void Comp2Mod(GF2X& x1, GF2X& x2, const GF2X& g1, const GF2X& g2,
              const GF2X& h, const GF2XModulus& F)

{
   long m = SqrRoot(deg(g1) + deg(g2) + 2);

   if (m == 0) {
      clear(x1);
      clear(x2);
      return;
   }

   GF2XArgument A;

   build(A, h, F, m);

   GF2X xx1, xx2;

   CompMod(xx1, g1, A, F);
   CompMod(xx2, g2, A, F);

   x1 = xx1;
   x2 = xx2;
}

void Comp3Mod(GF2X& x1, GF2X& x2, GF2X& x3, 
              const GF2X& g1, const GF2X& g2, const GF2X& g3,
              const GF2X& h, const GF2XModulus& F)

{
   long m = SqrRoot(deg(g1) + deg(g2) + deg(g3) + 3);

   if (m == 0) {
      clear(x1);
      clear(x2);
      clear(x3);
      return;
   }

   GF2XArgument A;

   build(A, h, F, m);

   GF2X xx1, xx2, xx3;

   CompMod(xx1, g1, A, F);
   CompMod(xx2, g2, A, F);
   CompMod(xx3, g3, A, F);

   x1 = xx1;
   x2 = xx2;
   x3 = xx3;
}



void build(GF2XTransMultiplier& B, const GF2X& b, const GF2XModulus& F)
{
   long db = deg(b);

   if (db >= F.n) Error("build TransMultiplier: bad args");

   GF2X t;

   LeftShift(t, b, F.n-1);
   div(t, t, F);

   // we optimize for low degree b

   long d;

   d = deg(t);
   if (d < 0)
      B.shamt_fbi = 0;
   else
      B.shamt_fbi = F.n-2 - d; 

   CopyReverse(B.fbi, t, d);

   if (F.method != GF2X_MOD_TRI && F.method != GF2X_MOD_PENT) {
   
      // The following code optimizes the case when 
      // f = X^n + low degree poly
   
      trunc(t, F.f, F.n);
      d = deg(t);
      if (d < 0)
         B.shamt = 0;
      else
         B.shamt = d;

      CopyReverse(B.f0, t, d);
   }


   if (db < 0)
      B.shamt_b = 0;
   else
      B.shamt_b = db;

   CopyReverse(B.b, b, db);
}

void TransMulMod(GF2X& x, const GF2X& a, const GF2XTransMultiplier& B,
               const GF2XModulus& F)
{
   if (deg(a) >= F.n) Error("TransMulMod: bad args");

   GF2XRegister(t1);
   GF2XRegister(t2);
   GF2XRegister(t3);

   mul(t1, a, B.b);
   RightShift(t1, t1, B.shamt_b);

   if (F.method == GF2X_MOD_TRI) {
      RightShift(t2, a, F.k3);
      add(t2, t2, a);
   }
   else if (F.method == GF2X_MOD_PENT) {
      RightShift(t2, a, F.k3);
      RightShift(t3, a, F.k2);
      add(t2, t2, t3);
      RightShift(t3, a, F.k1);
      add(t2, t2, t3);
      add(t2, t2, a);
   }
   else {
      mul(t2, a, B.f0);
      RightShift(t2, t2, B.shamt);
   }

   trunc(t2, t2, F.n-1);

   mul(t2, t2, B.fbi);
   if (B.shamt_fbi > 0) LeftShift(t2, t2, B.shamt_fbi);
   trunc(t2, t2, F.n-1);
   MulByX(t2, t2);

   add(x, t1, t2);
}

void UpdateMap(vec_GF2& x, const vec_GF2& a, const GF2XTransMultiplier& B,
       const GF2XModulus& F)
{
   GF2XRegister(xx);
   GF2XRegister(aa);
   conv(aa, a);
   TransMulMod(xx, aa, B, F);
   conv(x, xx);
}
   

void ProjectPowers(GF2X& x, const GF2X& a, long k, const GF2XArgument& H,
                   const GF2XModulus& F)
{
   long n = F.n;

   if (deg(a) >= n || k < 0 || NTL_OVERFLOW(k, 1, 0)) 
      Error("ProjectPowers: bad args");

   long m = H.H.length()-1;
   long l = (k+m-1)/m - 1;

   GF2XTransMultiplier M;
   build(M, H.H[m], F);

   GF2X s;
   s = a;

   x.SetMaxLength(k);
   clear(x);

   long i;

   for (i = 0; i <= l; i++) {
      long m1 = min(m, k-i*m);
      for (long j = 0; j < m1; j++)
         SetCoeff(x, i*m+j, InnerProduct(H.H[j].xrep, s.xrep));
      if (i < l)
         TransMulMod(s, s, M, F);
   }
}


void ProjectPowers(vec_GF2& x, const vec_GF2& a, long k, 
                   const GF2XArgument& H, const GF2XModulus& F)
{
   GF2X xx;
   ProjectPowers(xx, to_GF2X(a), k, H, F);
   VectorCopy(x, xx, k);
}


void ProjectPowers(GF2X& x, const GF2X& a, long k, const GF2X& h, 
                   const GF2XModulus& F)
{
   if (deg(a) >= F.n || k < 0) Error("ProjectPowers: bad args");

   if (k == 0) {
      clear(x);
      return;
   }

   long m = SqrRoot(k);

   GF2XArgument H;
   build(H, h, F, m);

   ProjectPowers(x, a, k, H, F);
}

void ProjectPowers(vec_GF2& x, const vec_GF2& a, long k, const GF2X& H,
                   const GF2XModulus& F)
{
   GF2X xx;
   ProjectPowers(xx, to_GF2X(a), k, H, F);
   VectorCopy(x, xx, k);
}


void MinPolyInternal(GF2X& h, const GF2X& x, long m)
{
   GF2X a, b, r, s;
   GF2X a_in, b_in;

   if (IsZero(x)) {
      set(h);
      return;
   }

   clear(a_in);
   SetCoeff(a_in, 2*m);

   CopyReverse(b_in, x, 2*m-1);
      
   a.xrep.SetMaxLength(a_in.xrep.length()+1);
   b.xrep.SetMaxLength(b_in.xrep.length()+1);

   long max_sz = max(a_in.xrep.length(), b_in.xrep.length());
   r.xrep.SetLength(max_sz+1);
   s.xrep.SetLength(max_sz+1);

   _ntl_ulong *rp = r.xrep.elts();
   _ntl_ulong *sp = s.xrep.elts();

   long i;
   for (i = 0; i <= max_sz; i++) {
      rp[i] = sp[i] = 0;
   }

   sp[0] = 1;

   long sr = 0;
   long ss = 1;

   a = a_in;
   b = b_in;

   _ntl_ulong *ap = a.xrep.elts();
   _ntl_ulong *bp = b.xrep.elts();

   long da = deg(a);
   long wa = da/NTL_BITS_PER_LONG;
   long ba = da - wa*NTL_BITS_PER_LONG;

   long db = deg(b);
   long wb = db/NTL_BITS_PER_LONG;
   long bb = db - wb*NTL_BITS_PER_LONG;

   long parity = 0;

   for (;;) {
      if (da < db) {
         swap(ap, bp);
         swap(da, db);
         swap(wa, wb);
         swap(ba, bb);
         parity = 1 - parity;

         swap(rp, sp);
         swap(sr, ss);
      }

      // da >= db

      if (db < m) break;

      ShiftAdd(ap, bp, wb+1, da-db);
      ShiftAdd(rp, sp, ss, da-db);
      long t = ss + (da-db+NTL_BITS_PER_LONG-1)/NTL_BITS_PER_LONG;
      if (t > sr) {
         while (t > 0 && rp[t-1] == 0) t--; 
         sr = t;
      }

      _ntl_ulong msk = 1UL << ba;
      _ntl_ulong aa = ap[wa];

      while ((aa & msk) == 0) {
         da--;
         msk = msk >> 1;
         ba--;
         if (!msk) {
            wa--;
            ba = NTL_BITS_PER_LONG-1;
            msk = 1UL << (NTL_BITS_PER_LONG-1);
            if (wa < 0) break;
            aa = ap[wa];
         }
      }
   }

   a.normalize();
   b.normalize();
   r.normalize();
   s.normalize();

   if (!parity) {
      h = s;
   }
   else {
      h = r;
   }
}


void DoMinPolyMod(GF2X& h, const GF2X& g, const GF2XModulus& F, long m, 
               const GF2X& R)
{
   GF2X x;

   ProjectPowers(x, R, 2*m, g, F);
   MinPolyInternal(h, x, m);
}

void MinPolySeq(GF2X& h, const vec_GF2& a, long m)
{
   if (m < 0 || NTL_OVERFLOW(m, 1, 0)) Error("MinPoly: bad args");
   if (a.length() < 2*m) Error("MinPoly: sequence too short");
   GF2X x;
   x.xrep = a.rep;
   x.normalize();
   MinPolyInternal(h, x, m);
}

void ProbMinPolyMod(GF2X& h, const GF2X& g, const GF2XModulus& F, long m)
{
   long n = F.n;
   if (m < 1 || m > n) Error("ProbMinPoly: bad args");

   GF2X R;
   random(R, n);

   DoMinPolyMod(h, g, F, m, R);
}

void ProbMinPolyMod(GF2X& h, const GF2X& g, const GF2XModulus& F)
{
   ProbMinPolyMod(h, g, F, F.n);
}

void MinPolyMod(GF2X& hh, const GF2X& g, const GF2XModulus& F, long m)
{
   GF2X h, h1;
   long n = F.n;
   if (m < 1 || m > n) Error("MinPoly: bad args");

   /* probabilistically compute min-poly */

   ProbMinPolyMod(h, g, F, m);
   if (deg(h) == m) { hh = h; return; }
   CompMod(h1, h, g, F);
   if (IsZero(h1)) { hh = h; return; }

   /* not completely successful...must iterate */


   GF2X h2, h3;
   GF2X R;
   GF2XTransMultiplier H1;
   

   for (;;) {
      random(R, n);
      build(H1, h1, F);
      TransMulMod(R, R, H1, F);
      DoMinPolyMod(h2, g, F, m-deg(h), R);

      mul(h, h, h2);
      if (deg(h) == m) { hh = h; return; }
      CompMod(h3, h2, g, F);
      MulMod(h1, h3, h1, F);
      if (IsZero(h1)) { hh = h; return; }
   }
}

void IrredPolyMod(GF2X& h, const GF2X& g, const GF2XModulus& F, long m)
{
   if (m < 1 || m > F.n) Error("IrredPoly: bad args");

   GF2X R;
   set(R);

   DoMinPolyMod(h, g, F, m, R);
}



void IrredPolyMod(GF2X& h, const GF2X& g, const GF2XModulus& F)
{
   IrredPolyMod(h, g, F, F.n);
}



void MinPolyMod(GF2X& hh, const GF2X& g, const GF2XModulus& F)
{
   MinPolyMod(hh, g, F, F.n);
}



void MulByXMod(GF2X& c, const GF2X& a, const GF2XModulus& F)
{
   long da = deg(a);
   long df = deg(F);
   if (da >= df) Error("MulByXMod: bad args"); 

   MulByX(c, a);

   if (da >= 0 && da == df-1)
      add(c, c, F);
}

static
void MulByXModAux(GF2X& c, const GF2X& a, const GF2X& f)
{
   long da = deg(a);
   long df = deg(f);
   if (da >= df) Error("MulByXMod: bad args"); 

   MulByX(c, a);

   if (da >= 0 && da == df-1)
      add(c, c, f);
}

void MulByXMod(GF2X& h, const GF2X& a, const GF2X& f)
{
   if (&h == &f) {
      GF2X hh;
      MulByXModAux(hh, a, f);
      h = hh;
   }
   else
      MulByXModAux(h, a, f);
}




void power(GF2X& x, const GF2X& a, long e)
{
   if (e < 0) {
      Error("power: negative exponent");
   }

   if (e == 0) {
      x = 1;
      return;
   }

   if (a == 0 || a == 1) {
      x = a;
      return;
   }

   long da = deg(a);

   if (da > (NTL_MAX_LONG-1)/e)
      Error("overflow in power");

   GF2X res;
   res.SetMaxLength(da*e + 1);
   res = 1;
   
   long k = NumBits(e);
   long i;

   for (i = k - 1; i >= 0; i--) {
      sqr(res, res);
      if (bit(e, i))
         mul(res, res, a);
   }

   x = res;
}


static
void FastTraceVec(vec_GF2& S, const GF2XModulus& f)
{
   long n = deg(f);

   if (n <= 0) Error("TraceVec: bad args");

   GF2X x = reverse(-LeftShift(reverse(diff(reverse(f)), n-1), n-1)/f, n-1);

   VectorCopy(S, x, n);
   S.put(0, to_GF2(n));
}

static
void PlainTraceVec(vec_GF2& S, const GF2X& f)
{
   long n = deg(f);

   if (n <= 0) 
      Error("TraceVec: bad args");

   if (n == 0) {
      S.SetLength(0);
      return;
   }

   GF2X x = reverse(-LeftShift(reverse(diff(reverse(f)), n-1), n-1)/f, n-1);

   VectorCopy(S, x, n); 
   S.put(0, to_GF2(n));
}


void TraceVec(vec_GF2& S, const GF2X& f)
{
   PlainTraceVec(S, f);
}

static
void ComputeTraceVec(const GF2XModulus& F)
{
   vec_GF2& S = *((vec_GF2 *) &F.tracevec);

   if (S.length() > 0)
      return;

   if (F.method == GF2X_MOD_PLAIN) {
      PlainTraceVec(S, F.f);
   }
   else {
      FastTraceVec(S, F);
   }
}

void TraceMod(GF2& x, const GF2X& a, const GF2XModulus& F)
{
   long n = F.n;

   if (deg(a) >= n)
      Error("trace: bad args");

   if (F.tracevec.length() == 0) 
      ComputeTraceVec(F);

   project(x, F.tracevec, a);
}

void TraceMod(GF2& x, const GF2X& a, const GF2X& f)
{
   if (deg(a) >= deg(f) || deg(f) <= 0)
      Error("trace: bad args");

   project(x, TraceVec(f), a);
}


NTL_END_IMPL


syntax highlighted by Code2HTML, v. 0.9.1