// unused/bad code


// set mul to OLD_mul (which works) or NEW_mul (which doesn't work)
//
void (*mul)(C r, C u, C v) = OLD_mul;


// mul: r = u*v with reduction and mod
//
//  w =   u[21]*x^21 + u[20]*x^20 + ... + u[1]*x + u[0]
//      * v[21]*x^21 + v[20]*x^20 + ... + v[1]*x + v[0]
//
//    =   u[21]*v[21]*x^42 + ... + (u[1]*v[0]+u[0]*v[1])*x + u[0]*v[0]
//
//    =         w[42]*x^42 + ... +                  w[1]*x +      w[0]
//
//  r = w mod 2^255 - 19
//
// w[42]...w[22] are not computed, they are folded into r[21]...r[0] with an implicit mod
// 
// *** this does not work ***
//
void NEW_mul( C r, C u, C v)
{
  int i, j, k, s;  UI t;

  for( k = 0; k < N; ++k) r[k] = 0;

  for( i = 0; i < N; ++i) for( j = 0; j < N; ++j)
  {
    k = i+j;  t = u[i]*v[j];

    if( k < N) { r[k] += t; }

    else { t *= 19; s = k-N+1; r[s] += t >> 3; r[s-1] += (t & 7) << (B-3); }
// ***
// this will not work with 22 12-bit pieces, intermediate results can overflow:
//
//   sum of two 12-bit pieces is 13-bits, product of two 13-bit pieces is
//   26-bits per piece, and 22 26-bit pieces have to be added in the worst case
//   which is (26+5)-bits = 31-bits
//
// and then the factor of 19 is another 5 bits, (31+5) = 36 bits total.
//
// Maybe rewrite everything using 32 8-bit pieces?  That will fit:
//
//   sum of two 8-bit pieces is 9-bits, product of two 9-bit pieces is
//   18-bits per piece, and 32 18-bit pieces have to be added in the worst case
//   which is (18+5)-bits = 23-bits
//
// and then the factor of 19 is another 5 bits, (23+5) = 28 bits total.
// ***
  }

  print( "mul1", r, N);

  reduce( r, N);

  // if( debug) print( "mul2", r, N);

  t = r[N-1] >> 3;

  if( t)
  {
    if( debug) printf( "mul: t = %x\n", t); // ZZ

    r[N-1] -= t << 3; r[0] += 19*t; 

    // if( debug) print( "mul3", r, N);

    reduce( r, N); // ZZ replace with "fast" reduce

    // if( debug) print( "mul4", r, N);
  }

 // compare r with m = (2^255) - 19 = 7 fff fff ... fff fed
 //
  if( r[N-1] < 7) return; // r < m

  for( int i = N-2; i > 0; --i) if( r[i] < 0xfff) return; // r < m

  if( r[0] < 0xfed) return; // r < m

  if( debug) printf( "mul: r >= m\n");

 // r >= m, subtract m
 //
  r[N-1] -= 7; r[0] -= 0xfed;

  for( int i = 1; i < N-1; ++i) r[i] -= 0xfff;
}

//------------------------------------------------------------------------------

// OLD_sub: w = u-v with no reduction
//
// *** assumes that B is 12 and N is 22, i.e. 2^255 = 2^3 * (2^12)^21 ***
//
// assumes that v is in the range 0 to m-1,
// 
// m = 2^255 - 19 = 7 fff fff ... fff fed
//
void OLD_sub( C w, C u, C v)
{
 // start with w = 1's complement of v
 //
  w[N-1] = (~v[N-1]) & 7;
  for( int i = 0; i < N-1; ++i) w[i] = (~v[i]) & 0xfff;

 // add 1 -> 2's complement, then add m, i.e. add m+1
 //
  w[0] += 0xfed + 1;
  w[N-1] += 7;
  w[N-1] &= 7; // mask out any overflow
  for( int i = 1; i < N-1; ++i) w[i] += 0xfff;

  if( debug) print( "sub1", w, N);

 // reduce and mask out any overflow
 //
  reduce( w, N); w[N-1] &= 7;

  if( debug) print( "sub2", w, N);

 // now w = m-v, add u
 //
  add( w, u, w);
}

//------------------------------------------------------------------------------

// curve group order L = 2^253 - L2
//
C L =
{
	0x3ed, 0xf5d, 0xa5c, 0x631, 0x812, 0xd65, 0x79c, 0xa2f, 0x9de, 0xdef, 0x014,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x1
};

C L2 =
{
	0xc13, 0x0a2, 0x5a3, 0x9ce, 0x7ed, 0x29a, 0x863, 0x5d0, 0x621, 0x210, 0xfeb,
	0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0x0
};

//------------------------------------------------------------------------------

// operations mod L, the curve group order
//
// L1 = 27742317777372353535851937790883648493
//
//  L = 2^252 + L1
//
//    = 1 000 000 000 000 000 000 000 000 000 000 014 def 9de a2f 79c d65 812 631 a5c f5d 3ed
//
//    = 2^252 + 2^252 - (2^252 - L1)
//
//    = 2^253 - L2
//
// L2 = 0 fff fff fff fff fff fff fff fff fff fff feb 210 621 5d0 863 29a 7ed 9ce 5a3 0a2 c13
//
//     21  20  19  18  17  16  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
//
// use the L2 formula for reducing mod L to avoid subtraction
// (L2 is actually the 2's complement of L1)
//
//  poly(L2) = L2[20]*x^20 + ... + L2[0], with x = 2^12,
//
//  2^253 - L2 = 2*x^21 - poly(L2)
//
//  divide into poly(w) = w[42]*x^42 + ... + w[1]*x + w[0]
//
//   = w[42]*2^11*x^20 + ... ; first remainder = poly(w[41...0]) + w[42]*2^11*x^20*poly(L2)
//
// ...

// mod_L: u = w mod (2^252 + 27742317777372353535851937790883648493)
//
// *** assumes that B is 12 and N is 22 ***
//
// NOTE: u may not be fully reduced but will not exceed (2^253 - 1)
//
// NOTE: this routine overwrites w
//
void mod_L( C u, D w)
{

// ZZ in progress, still has some bugs

  UI t; int k;

// NN = 43, N = 22
// i = 42, remainder = poly(w[41...0]) + w[42]*2^11*x^20*(L2[20]*x^20 + ... + L2[0])
// i = 41, remainder = poly(w[40...0]) + w[41]*2^11*x^19*(L2[20]*x^20 + ... + L2[0])
// ...
// i = 22, remainder = poly(w[21...0]) + w[22]*2^11*x^0*(L2[20]*x^20 + ... + L2[0])

  for( int i = NN-1; i >= N; --i) // w index
  {
    for( int j = 0; j < N-1; ++j) // L2 index, L2[N-1] = 0
    {
      t = w[i]*L2[j]; k = i+j-N; // remainder index
      w[k] += (t & 1) << 11;
      w[k+1] += (w[k] >> B) + (t >> 1); // carry + remainder
      w[k] &= MASK; // reduce
    }

// ? maybe the main mistake is that w[i] should be set to 0 here ?

    // w[k+1] &= MASK; // ZZ ? // mask out any overflow
  }
 
  for( int i = 0; i < N; ++i) u[i] = w[i]; // copy to u

  t = u[N-1] >> 1;

  while( t)
  {
    if( debug) printf( "mod_L: t = %x\n", t);

// 2*x^21 - L2 divided into u[21]*x^21 + ... + u[0]

    u[N-1] -= t << 1;

    for( int j = 0; j < N-1; ++j) // L2 index, L2[N-1] = 0
    {
      u[j] += t*L2[j];
      u[j+1] += u[j] >> B; // carry
      u[j] &= MASK; // reduce
    }

    t = u[N-1] >> 1; // in rare cases this will be non-zero
  }

  adjust_L( u);
}

// adjust_L: if u >= L then subtract m
//
// *** assumes that B is 12 and N is 22 ***
//
// also assumes that mod has already been performed
// so that u is in the range 0 ... (2^253 - 1)
//
void adjust_L( C u)
{
  for( int i = N-1; i >= 0; --i) if( u[i] < L[i]) return; // u < L

  if( debug) { print( "adjust_L", u, N); printf( "adjust_L: u >= L\n"); }

 // u >= L, subtract L, i.e. add L2
 //
  for( int i = 0; i < N-1; ++i) { u[i] += L2[i]; u[i+1] += (u[i] >> B); u[i] &= MASK; }

  u[N-1] &= 1; // mask out any overflow
}

//------------------------------------------------------------------------------