typedef long long Int64;
typedef int   Int32;
typedef short Int16;
typedef char  Int8;
typedef unsigned long long UInt64;
typedef unsigned int   UInt32;
typedef unsigned long  ULong;
typedef unsigned short UInt16;
typedef unsigned char  UInt8;

// for dll creation
#define static

// Intrinsic functions (insted of #include <intrin.h>)
UInt64 __emulu(UInt32 ua, UInt32 ub); // <- 64bit ua*ub (no intrinsic division!)

__forceinline UInt8 _BitScanForward(  UInt32 *Index, UInt32 Mask);// bsf, <- 0 if Mask=0, Index <- LS bit position
__forceinline UInt8 _BitScanReverse(  UInt32 *Index, UInt32 Mask);// bsr, <- 0 if Mask=0, Index <- MS bit

#define XTR  2
#define SGN(x) x[0]
#define LEN(x) x[1]
#define LSW(x) x[2]
#define LST(x) XTR-1+LEN(x)
#define MSW(x) x[LST(x)]
#define SET(x,v) SGN(x)=1; LEN(x) = 1; LSW(x) = v
#define SWP(x,y) t = x; x = y; y = t

__forceinline static UInt32 UDIV(UInt32 LS, UInt32 MS, UInt32 d, UInt32 *r) { // <- n64/u, r <- n64%u. (n64=MS|LS)
   __asm {
      mov eax, LS
      mov edx, MS
      mov ecx, r
      div d
      mov DWORD PTR [ecx], edx
   }
}

__forceinline static UInt32 ADC(UInt32 x, UInt32 y, UInt32 *c) { // <- x + y + c, c <- new carry
   __asm {                      // x += y + *c; *c = carry; return x;
      xor edx, edx              // edx = 0
      mov ecx, c                // ecx = c
      mov eax, dword ptr[ecx]   // eax = *c
      add eax, y                // eax += y, set carry
      adc edx, 0                // edx = carry
      add eax, x                // eax += x, eax --> return
      adc edx, 0                // edx += carry
      mov dword ptr[ecx], edx   // *c = carry
   }
}

__forceinline static UInt32 SBB(UInt32 x, UInt32 y, UInt32 *b) { // <- x - y - b, b <- new borrow
   __asm {                      // x -= y + *b; *b = borrow; return x;
      xor edx, edx              // edx = 0
      mov ecx, b                // ecx = b
      mov eax, x                // eax = x
      sub eax, dword ptr[ecx]   // eax -= *b
      adc edx, 0                // edx = carry
      sub eax, y                // eax -= y, set carry
      adc edx, 0                // edx += carry
      mov dword ptr[ecx], edx   // *b = borrow
   }
}

__forceinline Int32 CMP(UInt32 *x, UInt32 *y) { // MP_ numbers compared -0+
   Int32 i; UInt32 c;
   c = LEN(x) - LEN(y);
   if (c != 0)
      return LEN(x)>c ? 1 : -1;
   for (i = LEN(x)+XTR-1; i >= XTR; --i) {
      c = x[i] - y[i];
      if (c != 0)
         return x[i]>c ? 1 : -1;
   }
   return 0;
}
__forceinline Int32 shCMP(UInt32 *x, UInt32 *y, Int32 o, Int32 s) { // MP_ number x compared with y[+o]<<s
   Int32 i; UInt32 c;                        // s = shift #bits in words, o = ofset in words
   for (i = LST(x); i > o+XTR; --i) {        // from MS to LS words, ASSUME equal MS bits!!
      c = x[i] - ((y[i-o]<<s) | (y[i-o-1]>>(31-s)>>1));
      if (c != 0)                            // not equal, we can finish compare
         return x[i]>c ? 1 : -1;
   }                                         // all equal until LS word of shifted y
   c = x[o+XTR] - (y[XTR]<<s);               // partial shifted LS word of y
   if (c != 0)
      return x[o+XTR]>c ? 1 : -1;
   for (i = o+XTR-1; i >=XTR; --i)           // the rest of y == 0
      if (x[i] > 0) return 1;
   return 0;
}
__forceinline void shSUB(UInt32 *x, UInt32 *y, Int32 o, Int32 s) { // MP_INT x -= y[+o]<<s. x be larger!
   UInt32 i, b = 0;                          // s = shift #bits in words, o = ofset in words
   x[o+XTR] = SBB(x[o+XTR], y[XTR]<<s, &b);  // Subtract with borrow, partial shifted word
   for (i = o+XTR+1; i <= LST(x); ++i)       // from LSW to MSW, subtract with borrow
      x[i] = SBB(x[i], (y[i-o]<<s) | (y[i-o-1]>>(31-s)>>1), &b);
   b = 1;                                    // no final borrow, get MS word
   for (i = LST(x); i > XTR; --i)            // scan for MS non-zero word
      if (x[i] != 0) {
         b = i-XTR+1; break; }
   LEN(x) = b;                               // update length
}
__forceinline void shADD(UInt32 *x, UInt32 *y, Int32 o, Int32 s) { // MP_INT x+=y[+o]<<s. Unused words be 0!
   UInt32 c = 0, i, L;                       // s = shift #bits in words, o = ofset in words
   x[o+XTR] = ADC(x[o+XTR], y[XTR]<<s, &c);  // Add with carry, partial shifted word
   L = LST(y)+o; if (L < LST(x)) L = LST(x); // loop until both x and y are used up
   for (i = o+XTR+1; i <= L; ++i)            // from LS to MS words, add with carry to x the shifted y
      x[i] = ADC(x[i], (y[i-o]<<s) | (y[i-o-1]>>(31-s)>>1), &c);
   x[L+1] = ADC(x[L+1],y[L-o]>>(31-s)>>1,&c);// shifted y can be one longer
   LEN(x) = L-XTR+2;                         // update length, keep sign
   if (c > 0) {
      x[L+2] = c;                            // final carry
      LEN(x) += 1; }
   else if (x[L+1]==0)
      LEN(x) -= 1;                           // no carry, last set x was 0
}

// MP_INT: {sign,len,word[0],..word[len-1]}. e,f,x,y == all 0, each one longer than max(a,b)!
void xGCD(UInt32 **g, UInt32 **c, UInt32 **d,          // OUTPUT: g = gcd(a,b) = c*a + d*b
          UInt32 *a, UInt32 *b,                        // INPUT: a,b > 0, (destroyed)
          UInt32 *e, UInt32 *f, UInt32 *x, UInt32 *y) {// Work buffers: e,f, x,y of len = max(La,Lb)+1
   Int32 i, o, s, v; UInt32 *t;
   SET(e,0); SET(x,1); SGN(e) = -1;  // start with "-0"
   SET(f,1); SET(y,0); SGN(y) = -1;  // x-=(e<<s) or e-=(x<<s) will be OK
   for (;;) {
      i = CMP(a,b);
      if (i == 0)
         break;
      if (i < 0) {
         SWP(a,b); SWP(x,e); SWP(y,f); }
      _BitScanReverse(&s, MSW(a));   // index of MS bit, 0..31
      _BitScanReverse(&v, MSW(b));
      s -= v;
      o = LEN(a)-LEN(b);
      if (s < 0) { o -= 1; s += 32; }
      i = shCMP(a,b,o,s);
      if (i == 0)
         break;
      if (i < 0) s -= 1; if (s < 0) { o -= 1; s += 32; }
      shSUB(a,b,o,s);
      if (MSW(a)==0)
         break;
      shADD(x,e,o,s);
      shADD(y,f,o,s);
   }
   *g = b; *c = e; *d = f;
}

static void CPY(char *dest, char *src, Int32 count) { // copy chars
   Int32 i;
   for (i = 0; i < count; ++i)
      *dest++ = *src++;
}

static void SFL(UInt32 *y, Int32 Lx, UInt32 *x, UInt32 s) { // y <- x<<s, s<32
   Int32 i; UInt32 c = 0, t = 32-s;
   for (i = 0; i < Lx; ++i) {
      *y++ = (*x<<s) | c;
      c = *x++ >> t;
   }
   *y = c;
}

static void SFR(UInt32 *y, Int32 Lx, UInt32 *x, UInt32 s) { // y <- x>>s, s<32
   Int32 i; UInt32 c = 0, t = 32-s;
   y += Lx; x += Lx;
   for (i = 0; i < Lx; ++i) {
      *--y = c | (*--x>>s);
      c = *x << t;
   }
   *--y = c;
}


static UInt32 DivIpp(UInt32 y[], Int32 Lx, UInt32 x[], UInt32 d) { // y <- x/u
   Int32 i; UInt32 r = 0;
   for (i = Lx-1; i >= 0; --i)
      y[i] = UDIV(x[i],r,d,&r);
   return r;
}

static void DIVpq(UInt32 q[], Int32 Ly, UInt32 y[], Int32 Lx, UInt32 x[]) { // q <- y/x, y <- y%x
   Int32 i, j, k, n, m; UInt32 c, t, B, D, M, L; // ASSUME Ly >= Lx >= 2, y[Ly]==y[-1]==0
   UInt64 pp; UInt32 *p = (UInt32*)(&pp);        // p[0] = LS, p[1] = MS word of UInt64 pp

   _BitScanReverse(&n, x[Lx-1]); k = 31-n;       // MSb of x (n = 1..31), (.>>32 undefined)
   D = (x[Lx-1] << k) | (x[Lx-2]>>n>>1);         // normalize divisor by k = 31-n (0..30)
   B = (x[Lx-2] << k);                           // MS part of next digit of divisor
   if (Lx > 2) B |= (x[Lx-3]>>n>>1);             // use one more word, if can
   for (i = Ly; i >= Lx; --i) {                  // y[Ly] = 0
      M = (y[ i ] << k) | (y[i-1]>>n>>1);        // dividend << k
      L = (y[i-1] << k) | (y[i-2]>>n>>1);        // y is padded on both ends
      if (M < D) {                               // test (almost never overflow)
         t = UDIV(L,M,D,&c);                     // trial quotient digit (c <- mod) can be 1 too large
         pp = __emulu(t,B);                      // extra multiplication for test too large t
         if (p[1]>(c+1)) t--; }                  // correction when t is known too large
      else t = 0xffffffff;                       // max digit
      c = 0; m = i - Lx;                         // carry, index of q
      for (j = 0; j < Lx; ++j) {                 // reduce dividend
         pp = __emulu(x[j],t) + c;               // 64-bit product, with carry
         c = p[1] + (y[m+j] < p[0]);             // new 32-bit carry
         y[m+j] -= p[0]; }                       // reduce dividend digits
      if (y[i] != c) {                           // t too large: y[i] not cleared
         c = 0;                                  // clear carry
         for (j = 0; j < Lx; ++j)                // add back dividend
            y[m+j] = ADC(y[m+j],x[j],&c);        // using/updating c = carry
         t--; }                                  // decrement trial quotient
      y[i] = 0;                                  // MS word of y is cleared
      q[m] = t;                                  // store quotient digit
   }
}

static void DIVpp(UInt32 q[], Int32 Ly, UInt32 y[], Int32 Lx, UInt32 x[]) { // q <- y/x, y <- y%x
   Int32 i, j, n, k; UInt32 c, t, D, M, L;       // ASSUME Ly >= Lx >= 2, y[Ly]==y[-1]==0
   UInt64 pp; UInt32 *p = (UInt32*)(&pp);        // p[0] = LS, p[1] = MS word of UInt64 pp
   _BitScanReverse(&n, x[Lx-1]); k = 31-n;       // MSb of x (n = 1..31), (.>>32 undefined)
   D = (x[Lx-1] << k) | (x[Lx-2]>>n>>1);         // normalize divisor by k = 31-n (0..30)
   for (i = Ly; i >= Lx; --i) {                  // y[Ly] = 0
      M = (y[ i ] << k) | (y[i-1]>>n>>1);        // dividend << k
      L = (y[i-1] << k) | (y[i-2]>>n>>1);        // y is padded on both ends
      t = M < D ? UDIV(L,M,D,&c) : 0xffffffff;   // trial quotient digit (c <- mod) can be 1 too large
      c = 0;                                     // carry
      for (j = 0; j < Lx; ++j) {                 // reduce dividend
         pp = __emulu(x[j],t) + c;               // 64-bit product, with carry
         c = p[1] + (y[i-Lx+j] < p[0]);          // new 32-bit carry
         y[i-Lx+j] -= p[0]; }                    // reduce dividend digits
      if (y[i] != c) {                           // t too large: y[i] not cleared
         c = 0;                                  // clear carry
         for (j = 0; j < Lx; ++j)                // add back dividend
            y[j+i-Lx] = ADC(y[j+i-Lx],x[j],&c);  // using/updating c = carry
         t--; }                                  // decrement trial quotient
      y[i] = 0;                                  // MS word of y is cleared
      q[i-Lx] = t;                               // store quotient digit
   }
}

static void MULpp(UInt32 *z, Int32 Ly, UInt32 *y, Int32 Lx, UInt32 *x) { // z <- y*x
   Int32 i, j; UInt32 c, *yy, *zz; UInt64 p; UInt32 *q = (UInt32*)(&p)+1; // MS 32 bits of p
   for (i = 0; i < Lx; ++i) {
      c = 0; yy = y; zz = z + i;
      for (j = 0; j < Ly; ++j) {
         p = __emulu(*yy++,*x) + c;
         *zz += (UInt32)p;
         c = *q + (*zz++ < (UInt32)p);
      }
      *zz = c; ++x;
   }
}

static UInt32 MulIpp(UInt32 *y, Int32 Lx, UInt32 *x, UInt32 u) { // y <- x*u
   Int32 i; UInt32 c = 0; UInt64 p;
   for (i = 0; i < Lx; ++i) {
      p = __emulu(*x++,u) + c;
      c = (UInt32)(p >> 32);
      *y++ = (UInt32)p;
   }
   *y = c;
   return c;
}

static Int32 LENGTH(Int32 Lx, UInt32 x[]) { // <- index of nonzero MS word = 1..Len
   Int32 i;
   for (i = Lx-1; i >= 0; --i)
      if (x[i] > 0) return i+1;
   return 1;
}

static void ADDpp(UInt32 *z, Int32 Ly, UInt32 *y, Int32 Lx, UInt32 *x) { // z <- y+x
   Int32 i, c = 0;                       // ASSUMED: y > x, Len(z) = Ly+1
   for (i = 0; i < Lx; ++i)
      *z++ = ADC(*y++,*x++,&c);          // c = new carry
   for (i = 0; i < Ly-Lx; ++i)
      *z++ = ADC(*y++,0,&c);             // c = new carry
   *z = c;
}

static void SUBpp(UInt32 *z, Int32 Ly, UInt32 *y, Int32 Lx, UInt32 *x) { // z <- y-x
   Int32 i, b = 0;                       // ASSUMED: z is all 0, y > x: no final borrow
   for (i = 0; i < Lx; ++i)
      *z++ = SBB(*y++,*x++,&b);          // b = new borrow
   for (i = 0; i < Ly-Lx; ++i)
      *z++ = SBB(*y++,0,&b);
}

static Int32 CMPpp(Int32 Len, Int32 *eq, UInt32 x[], UInt32 y[]) { // <- -0+, eq <- #equal words
   Int32 i;
   *eq = 0;
   for (i = Len-1; i >= 0; --i)
      if (x[i] == y[i])
         *eq += 1;
      else
         return x[i]>y[i] ? 1 : -1;
   return 0;
}

static UInt32 AddIpp(UInt32 *y, Int32 Lx, UInt32 *x, UInt32 u) { // y <- x+u, return carry
   Int32 i;
   for (i = 0; i < Lx; ++i) {
      *y = u + *x++;
      u = *y++ < u; // carry
   }
   *y = u;
   return u;
}

static UInt32 SubIpp(UInt32 *y, Int32 Lx, UInt32 *x, UInt32 u) { // y <- x-u, return borrow
   Int32 i;
   for (i = 0; i < Lx; ++i) {
      *y++ = *x - u;
      u = *x++ < u; // borrow
   }
   return u;
}

static void UInt32toHex(UInt8 *hex, UInt32 u) { // in hex room for 5 bytes
   Int32 i; UInt32 d, f = 0;
   for (i=28; i>=0; i-=4) {
      d = (u >> i) & 15;
      f += d;
      if ((f>0) || (i==0)) {
         if (d > 9) *hex++ = d + 55;
         else       *hex++ = d + 48; }
   }
   *hex = 0;
}

static void UInt32to8Hex(UInt8 *hex, UInt32 u) { // in hex room for 5 bytes
   Int32 i, d;
   for (i=28; i>=0; i-=4) {
      d = (u >> i) & 15;
      if (d > 9) *hex++ = d + 55;
      else       *hex++ = d + 48;
   }
   *hex = 0;
}
