// compare speed of multiply for 22 12-bit pieces, with reduction at the end,
// vs. 11 24-bit pieces, 17 16-bit, or 8 32-bit pieces, with reduction during the multiply
//
// assumes no 64-bit data type, unless use64 option is specified
//
// but mul and mul16 use only 32-bit data regardless of the use64 option
//
// 17 16-bit pieces are used instead of 16 pieces to handle the product of two sums
// with no mod until the end, although the carries must be reduced during the multiply
//
// Usage: ./timing #count #nrand #seed check add use64 op|prod op16|prod16 op24|prod24 op32|prod32
//
// defaults: 1000000 100 op op16 op24 op32
//
// if nrand is 0 then maximum operand values 0xFF...FF are used
//
// if add option is specified only mul and mul16 are tested
//
// op|prod args select operand-scanning vs. product-scanning order (Comba)
// 
//   in operand-scanning order the elements of u and v are accessed in order:
//
//     for i=0,1,...: for j=0,1,...: w[i+j] += u[i]*v[j]
//
//   in product-scanning order the elements of w are accessed in order:
//
//     for k=0,1,...: for t = 0,1,...: w[k] += u[i-t]*v[j+t]
//
// R. Perry, June 2017

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <sys/resource.h>

// bit selection macros
//
#define LO16(u) ((u) & 0x0000FFFF)   // bottom 16 bits
#define HI16(u) ((u) >> 16)          // top 16 bits
#define LO32(u) ((u) & 0xFFFFFFFFLL) // bottom 32 bits
#define HI32(u) ((u) >> 32)          // top 32 bits
#define LO24(u) ((u) & 0x00FFFFFF)   // bottom 24 bits
#define HI24(u) ((u) >> 24)          // top 8 (or more) bits
#define LO8(u)  ((u) & 0x000000FF)   // bottom 8 bits

//------------------------------------------------------------------------------

// from impl.h,c

#define B     12	 // number of bits per piece
#define MASK  ((1<<B)-1) // mask to select B bits, e.g. 0...0111111111111
#define N     22	 // number of pieces
#define NN    (2*N-1)	 // number of pieces in a multiplication result
typedef unsigned int UI; // assume unsigned int is 32 bits
typedef UI C[N];	 // a 256-bit value using N B-bit pieces
typedef UI D[NN];	 // product of two C's

// reduce by propagating the carries
//
static inline void reduce( UI *w, int n)
{
  for( int i = 0; i < n-1; ++i) { w[i+1] += w[i] >> B; w[i] &= MASK; }
}

// mul_op: operand-scanning order
//
void mul_op( D w, C u, C v)
{
  for( int k = 0; k < NN; ++k) w[k] = 0;

  for( int i = 0; i < N; ++i) for( int j = 0; j < N; ++j) w[i+j] += u[i]*v[j];

  reduce( w, NN);
}

// add12: w = u+v with no reduction
//
void add12( C w, C u, C v)
{
  for( int i = 0; i < N; ++i) w[i] = u[i] + v[i];
}

//------------------------------------------------------------------------------

// mul_prod: product-scanning order
//
void mul_prod( D w, C u, C v)
{
/* w[k] += u[i]*v[j] illustration using N = 4:

  k   i            j              i starts at min(k,N-1)
 --- ---          ---             j starts at k-i
  0   0            0
  1   1, 0         0, 1           could use loop:
  2   2, 1, 0      0, 1, 2
  3   3, 2, 1, 0   0, 1, 2, 3     while( i >= 0 && j < N) w[k] += u[i--] * v[j++];
  4   3, 2, 1      1, 2, 3
  5   3, 2         2, 3
  6   3            3
  7   final carry, if any
      (not needed when using 12-bit pieces because u[N-1] uses only 4 bits)
      (not needed when using 16-bit pieces because u[N16-1] uses only 1 bit)
*/

  UI r = 0;

  for( int k = 0; k < NN; ++k) // note that NN is (2*N)-1, not 2*N
  {
    int i = (k < N) ? k : N-1,  j = k-i,  m = (N-j < i+1) ? N-j : i+1;

    for( int t = 0; t < m; ++t) r += u[i-t]*v[j+t];

    w[k] = r & MASK;  r >>= B; // final r value will be 0
  }
}

//------------------------------------------------------------------------------
  
// 17 16-bit pieces

#define N16    17	  // u[N16-1] uses only 1 bit
#define NN16   (2*N16-1)

// mul16_op: operand-scanning order
//
void mul16_op( UI *w, UI *u, UI *v)
{
  for( int k = 0; k < NN16; ++k) w[k] = 0;

  for( int i = 0; i < N16; ++i)
  {
    UI r = 0;

    for( int j = 0; j < N16; ++j)
    {
      int k = i+j;

      r += w[k] + u[i]*v[j]; w[k] = LO16(r); r = HI16(r);
    }

    if( i < N16-1) w[i+N16] = r;
  }
}

// mul16_prod: product-scanning order
//
void mul16_prod( UI *w, UI *u, UI *v)
{
  UI r = 0, s = 0;

  for( int k = 0; k < NN16; ++k)
  {
    int i = (k < N16) ? k : N16-1,  j = k-i,  m = (N16-j < i+1) ? N16-j : i+1;

    for( int t = 0; t < m; ++t) { r += u[i-t]*v[j+t]; s += HI16(r); r = LO16(r); }

    w[k] = r; r = LO16(s); s = HI16(s); // store and shift
  }
}

// add16: w = u+v with reduction
//
void add16( UI *w, UI *u, UI *v)
{
  UI r = 0;

  for( int i = 0; i < N16; ++i)
  {
    r += u[i] + v[i]; w[i] = LO16(r); r = HI16(r);
  }
}


//------------------------------------------------------------------------------

// helper routines for mul24 and mul32

// add 16-bit p to x (array of at least two 16-bit pieces)
//
static inline void add16p( UI *x, UI p)
{
  x[0] += p; x[1] += HI16(x[0]); x[0] = LO16(x[0]);
}
  
// add 24-bit p to x (array of at least two 16-bit pieces)
//
static inline void add24p( UI *x, UI p)
{
  UI plo = LO16(p), phi = HI16(p);

  x[0] += plo; x[1] += phi + HI16(x[0]); x[0] = LO16(x[0]);
}
  
// add 32-bit p to x (array of at least three 16-bit pieces)
//
static inline void add32p( UI *x, UI p)
{
  UI plo = LO16(p), phi = HI16(p);

  x[0] += plo; x[1] += phi + HI16(x[0]); x[0] = LO16(x[0]);

  x[2] += HI16(x[1]); x[1] = LO16(x[1]);
}

//------------------------------------------------------------------------------
  
// 11 24-bit pieces

#define N24    11
#define NN24   (2*N24)	// no -1 here like NN, u[N24-1] is 16 bits

// mul24_op: operand-scanning order
//
void mul24_op( UI *w, UI *u, UI *v)
{
  for( int k = 0; k < NN24; ++k) w[k] = 0;

  for( int i = 0; i < N24; ++i)
  {
    UI x[4] = { 0, 0, 0, 0 }; // result and carry, 16-bit pieces

    for( int j = 0; j < N24; ++j)
    {
      int k = i+j;

     // work with 16-bit pieces (note that HI16(x) is only 8 bits)
     //
      x[0] += LO16(w[k]); x[1] += HI16(w[k]);

      UI ulo = LO16(u[i]), uhi = HI16(u[i]), vlo = LO16(v[j]), vhi = HI16(v[j]);

      add32p( x+0, ulo*vlo); // low part

      add24p( x+1, uhi*vlo + ulo*vhi); // middle parts

      add16p( x+2, uhi*vhi); // top part

      w[k] = (LO8(x[1]) << 16) | x[0]; // store 24 bits

      x[0] = (x[1] >> 8) | (LO8(x[2]) << 8); // shift by 24 bits
      x[1] = (x[2] >> 8) | (x[3] << 8);
      x[2] = x[3] = 0;
    }

    w[i+N24] = (LO8(x[1]) << 16) | x[0];
  }
}

// mul24_op_64: operand-scanning order, 64-bit intermediate results
//
void mul24_op_64( UI *w, UI *u, UI *v)
{
  for( int k = 0; k < NN24; ++k) w[k] = 0;

  for( int i = 0; i < N24; ++i) for( int j = 0; j < N24; ++j)
  {
    int k = i+j;

    unsigned long long r = u[i]; r *= v[j]; r += w[k];

    w[k] = LO24(r); r = HI24(r);

    // unsigned long long y = w[k+1]; y += r; // debug testing

    w[k+1] += r; // this won't overflow

     // if( w[k+1] != y) printf( "bad\n");
  }
}

// mul24_prod: product-scanning order
//
void mul24_prod( UI *w, UI *u, UI *v)
{
  UI x[4] = { 0, 0, 0, 0 }; // result and carry, 16-bit pieces

  for( int k = 0; k < NN24; ++k)
  {
    int i = (k < N24) ? k : N24-1,  j = k-i,  m = (N24-j < i+1) ? N24-j : i+1;

    for( int t = 0; t < m; ++t)
    {
     // work with 16-bit pieces (note that HI16(x) is only 8 bits)
     //
      UI ulo = LO16(u[i-t]), uhi = HI16(u[i-t]), vlo = LO16(v[j+t]), vhi = HI16(v[j+t]);

      add32p( x+0, ulo*vlo); // low part

      add24p( x+1, uhi*vlo + ulo*vhi); // middle parts

      add16p( x+2, uhi*vhi); // top part
    }

    w[k] = (LO8(x[1]) << 16) | x[0]; // store 24 bits

    x[0] = (x[1] >> 8) | (LO8(x[2]) << 8); // shift by 24 bits
    x[1] = (x[2] >> 8) | (x[3] << 8);
    x[2] = x[3] = 0;
  }     
}

// mul24_prod_64: product-scanning order, 64-bit intermediate results
//
void mul24_prod_64( UI *w, UI *u, UI *v)
{
  unsigned long long r = 0;

  for( int k = 0; k < NN24; ++k)
  {
    int i = (k < N24) ? k : N24-1,  j = k-i,  m = (N24-j < i+1) ? N24-j : i+1;

    for( int t = 0; t < m; ++t)
    {
      r += ((unsigned long long) u[i-t]) * v[j+t];
    }

    w[k] = LO24(r); r = HI24(r);
  }
}
//------------------------------------------------------------------------------

// 8 32-bit pieces

#define N32    8
#define NN32   (2*N32)	// no -1 here like NN, u[N32-1] is 32 bits

// mul32_op: operand-scanning order
//
void mul32_op( UI *w, UI *u, UI *v)
{
  for( int k = 0; k < NN32; ++k) w[k] = 0;

  for( int i = 0; i < N32; ++i)
  {
    UI x[5] = { 0, 0, 0, 0, 0 }; // result and carry, 16-bit pieces

    for( int j = 0; j < N32; ++j)
    {
      int k = i+j;

     // work with 16-bit pieces
     //
      x[0] += LO16(w[k]); x[1] += HI16(w[k]);

      UI ulo = LO16(u[i]), uhi = HI16(u[i]), vlo = LO16(v[j]), vhi = HI16(v[j]);

      add32p( x+0, ulo*vlo); // low part

      add32p( x+1, uhi*vlo); add32p( x+1, ulo*vhi); // middle parts

      add32p( x+2, uhi*vhi); // top part

      w[k] = (x[1] << 16) | x[0]; // store 32 bits

      x[0] = x[2]; x[1] = x[3]; x[2] = x[4]; x[3] = x[4] = 0; // shift by 32 bits
    }

    w[i+N32] = (x[1] << 16) | x[0];
  }
}

// mul32_op_64: operand-scanning order, 64-bit intermediate results
//
void mul32_op_64( UI *w, UI *u, UI *v)
{
  for( int k = 0; k < NN32; ++k) w[k] = 0;

  for( int i = 0; i < N32; ++i)
  {
    unsigned long long r = 0;

    for( int j = 0; j < N32; ++j)
    {
      int k = i+j;

      r += ((unsigned long long) u[i]) * v[j] + w[k];

      w[k] = LO32(r); r = HI32(r);
    }

    w[i+N32] = r;
  }
}

// mul32_prod: product-scanning order
//
void mul32_prod( UI *w, UI *u, UI *v)
{
  UI x[5] = { 0, 0, 0, 0, 0 }; // result and carry, 16-bit pieces

  for( int k = 0; k < NN32; ++k)
  {
    int i = (k < N32) ? k : N32-1,  j = k-i,  m = (N32-j < i+1) ? N32-j : i+1;

    for( int t = 0; t < m; ++t)
    {
     // work with 16-bit pieces
     //
      UI ulo = LO16(u[i-t]), uhi = HI16(u[i-t]), vlo = LO16(v[j+t]), vhi = HI16(v[j+t]);
      
      add32p( x+0, ulo*vlo); // low part

      add32p( x+1, uhi*vlo); add32p( x+1, ulo*vhi); // middle parts

      add32p( x+2, uhi*vhi); // top part
    }

    w[k] = (x[1] << 16) | x[0]; // store 32 bits

    x[0] = x[2]; x[1] = x[3]; x[2] = x[4]; x[3] = x[4] = 0; // shift by 32 bits
  }     
}

// mul32_prod_64: product-scanning order, 64-bit intermediate results
//
void mul32_prod_64( UI *w, UI *u, UI *v)
{
  unsigned long long s = 0;

  for( int k = 0; k < NN32; ++k)
  {
    unsigned long long r = LO32(s);  s = HI32(s);

    int i = (k < N32) ? k : N32-1,  j = k-i,  m = (N32-j < i+1) ? N32-j : i+1;

    for( int t = 0; t < m; ++t)
    {
      r += ((unsigned long long) u[i-t]) * v[j+t];  s += HI32(r);  r = LO32(r);
    }

    w[k] = r;
  }
}

//------------------------------------------------------------------------------

// print MSB first
//
void print( const char *msg, UI *u, int n, const char *fmt)
{
  if( msg) printf( "%s = ", msg);

  for( int i = n-1; i >= 0; --i) printf( fmt, u[i]);

  putchar( '\n');
}

// convert 22 12-bit pieces to 11 24-bit pieces
//
void convert24( UI *y, const UI *x)
{
  for( int i = 0, j = 0; j < N; ++i, j += 2)
    y[i] = (x[j+1] << 12) | x[j];
}

// convert 22 12-bit pieces to 16 16-bit pieces
//
void convert16( UI *y, const UI *x)
{
// x: x xxx xxx x xx xx x xxx xxx x xx xx x xxx xxx x xx xx x xxx
//   21  20  19   18   17  16  15   14   13  12  11   10    9   8
// y:  yyyy  yyyy  yyyy  yyyy  yyyy  yyyy  yyyy  yyyy  yyyy  yyyy
//       15    14    13    12    11    10     9     8     7     6
//
// x: xxx x xx xx x xxx xxx x xx xx x xxx
//      7    6    5   4   3    2    1   0
// y:  yyyy  yyyy  yyyy  yyyy  yyyy  yyyy
//        5     4     3     2     1     0
//
  int i, j;

  for( i = 0, j = 0; j < N-1; i += 3, j += 4)
  {
    y[i]   = (x[j+1] & 0xF) << 12 | x[j];  if( i == 15) break;
    y[i+1] = (x[j+2] & 0xFF) << 8 | x[j+1] >> 4;
    y[i+2] =          x[j+3] << 4 | x[j+2] >> 8;
  }

  y[N16-1] = 0;
}

// convert 22 12-bit pieces to 8 32-bit pieces
//
void convert32( UI *y, const UI *x)
{
// x: x xxx xxx x xx xxx xxx xxx xxx xx x xxx xxx x xx xxx xxx
//   21  20  19   18  17  16  15  14   13  12  11   10   9   8
// y:    yyyyyyyy   yyyyyyyy   yyyyyyyy    yyyyyyyy   yyyyyyyy
//              7          6          5           4          3
//
// x: xxx xxx xx x xxx xxx x xx xxx xxx
//      7   6    5   4   3    2   1   0
// y:   yyyyyyyy    yyyyyyyy   yyyyyyyy
//             2           1          0
//
  y[7] = x[21] << 28 | x[20] << 16 | x[19] << 4 | x[18] >> 8;

  y[6] = (x[18] & 0xFF) << 24 | x[17] << 12 | x[16];

  y[5] = x[15] << 20 | x[14] << 8  | x[13] >> 4;

  y[4] = (x[13] & 0xF) << 28  | x[12] << 16 | x[11] << 4 | x[10] >> 8;

  y[3] = (x[10] & 0xFF) << 24 | x[9] << 12  | x[8];

  y[2] = x[7] << 20  | x[6] << 8   | x[5] >> 4;

  y[1] = (x[5] & 0xF) << 28   | x[4] << 16  | x[3] << 4  | x[2] >> 8;

  y[0] = (x[2] & 0xFF) << 24  | x[1] << 12  | x[0];
}

// return elapsed CPU time in seconds since the previous call
//
double cpu( void)
{
  static double t0 = 0;

  struct rusage r;  if( getrusage( RUSAGE_SELF, &r)) { perror( "getrusage"); exit(1); }

 // cumulative
 //
  double t = r.ru_utime.tv_sec + 1e-6*r.ru_utime.tv_usec
           + r.ru_stime.tv_sec + 1e-6*r.ru_stime.tv_usec;

 // delta
 //
  double dt = t - t0;  t0 = t;

  return dt;
}

int main( int argc, char *argv[])
{
 // set options
 //
 // ./timing #count #nrand #seed check use64 op|prod op16|prod16 op24|prod24 op32|prod32
 //
  int count = 1000000, nrand = 100, check, add, use64, prod, prod16, prod24, prod32;

  check = add = use64 = prod = prod16 = prod24 = prod32 = 0;

  int seed = time(0);

  if( argc > 1 && isdigit( argv[1][0])) { count = atoi( argv[1]); --argc; ++argv; }

  if( argc > 1 && isdigit( argv[1][0])) { nrand = atoi( argv[1]); --argc; ++argv; }

  if( argc > 1 && isdigit( argv[1][0])) { seed  = atoi( argv[1]); --argc; ++argv; }

  for( int i = 1; i < argc; ++i)
  {
    if(      strcmp( argv[i], "check")  == 0) check  = 1;
    else if( strcmp( argv[i], "add")    == 0) add    = 1;
    else if( strcmp( argv[i], "use64")  == 0) use64  = 1;
    else if( strcmp( argv[i], "op")     == 0) prod   = 0;
    else if( strcmp( argv[i], "prod")   == 0) prod   = 1;
    else if( strcmp( argv[i], "op16")   == 0) prod16 = 0;
    else if( strcmp( argv[i], "prod16") == 0) prod16 = 1;
    else if( strcmp( argv[i], "op24")   == 0) prod24 = 0;
    else if( strcmp( argv[i], "prod24") == 0) prod24 = 1;
    else if( strcmp( argv[i], "op32")   == 0) prod32 = 0;
    else if( strcmp( argv[i], "prod32") == 0) prod32 = 1;
    else { fprintf( stderr, "timing: bad arg: %s\n", argv[i]);
      fprintf( stderr, "Usage: ./timing #count #nrand #seed check use64 "
        "op|prod op16|prod16 op24|prod24 op32|prod32\n");
      return 1;
    }
  }

 // multiplication functions, w = u*v
 //
  typedef void (*Mul)( D w, C u, C v);

  Mul mul   =  prod  ? mul_prod : mul_op;

  Mul mul16 = prod16 ? mul16_prod : mul16_op;

  Mul mul24 = prod24 ? (use64 ? mul24_prod_64 : mul24_prod) : (use64 ? mul24_op_64 : mul24_op);

  Mul mul32 = prod32 ? (use64 ? mul32_prod_64 : mul32_prod) : (use64 ? mul32_op_64 : mul32_op);

  printf( "count = %d, nrand = %d, seed = %d, add = %d, ", count, nrand, seed, add);

  printf( "use64 = %d, prod,16,24,32 = %d %d %d %d\n", use64, prod, prod16, prod24, prod32);

 // create multiply inputs
 //
  srand(seed);

  int NRAND = (nrand <= 0) ? 1 : nrand;

  C u[NRAND], v[NRAND]; D w;

  if( nrand <= 0) // test using max values
  {
    u[0][N-1] = v[0][N-1] = 0xF;

    for( int i = 0; i < N-1; ++i) u[0][i] = v[0][i] = 0xFFF;
  }
  else // use random values
  {
    for( int k = 0; k < NRAND; ++k)
    {
      u[k][N-1] = rand() & 0xF; v[k][N-1] = rand() & 0xF;

      for( int i = 0; i < N-1; ++i) { u[k][i] = rand() & MASK; v[k][i] = rand() & MASK; }
    }
  }

  UI u16[NRAND][N16], v16[NRAND][N16], w16[NN16];

  for( int k = 0; k < NRAND; ++k)
  {
    convert16( u16[k], u[k]); convert16( v16[k], v[k]);
  }

  UI u24[NRAND][N24], v24[NRAND][N24], w24[NN24];

  for( int k = 0; k < NRAND; ++k)
  {
    convert24( u24[k], u[k]); convert24( v24[k], v[k]);
  }

  UI u32[NRAND][N32], v32[NRAND][N32], w32[NN32];

  for( int k = 0; k < NRAND; ++k)
  {
    convert32( u32[k], u[k]); convert32( v32[k], v[k]);
  }

  if( check)
  {
    print( "    u[0]",     u[0],   N, "%03x");
    print( "u16[0]", u16[0], N16, "%04x");
    print( "  u24[0]",   u24[0], N24, "%06x");
    print( "   u32[0] ", u32[0], N32, "%08x");
    print( "    v[0]",     v[0],   N, "%03x");
    print( "v16[0]", v16[0], N16, "%04x");
    print( "  v24[0]",   v24[0], N24, "%06x");
    print( "   v32[0] ", v32[0], N32, "%08x");
  }

  double t = cpu();

  if( add) // test mul() and mul16() multiplying two sums
  {
    C s1, s2; UI t1[NN16], t2[NN16];

   // mul()
   //
    for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND)
    {
      add12( s1, u[k], u[k]); add12( s2, u[k], v[k]); mul( w, s1, s2);
    }

    t = cpu(); printf( "  mul: %g secs\n", t);

    if( check) print( "     w", w, NN, "%03x");

   // mul16()
   //
    for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND)
    {
      add16( t1, u16[k], u16[k]); add16( t2, u16[k], v16[k]); mul16( w16, t1, t2);
    }

    t = cpu(); printf( "mul16: %g secs\n", t);

    if( check) print( "w16",  w16, NN16, "%04x");

   // exit
   //
    return 0;
  }

 // time mul()
 //
  for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul( w, u[k], v[k]);

  t = cpu(); printf( "  mul: %g secs\n", t);

  if( check) print( "     w", w, NN, "%03x");

 // time mul16()
 //
  for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul16( w16, u16[k], v16[k]);

  t = cpu(); printf( "mul16: %g secs\n", t);

  if( check) print( "w16", w16, NN16, "%04x");

 // time mul24()
 //
  for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul24( w24, u24[k], v24[k]);

  t = cpu(); printf( "mul24: %g secs\n", t);

  if( check) print( "w24", w24, NN24, "%06x");

 // time mul32()
 //
  for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul32( w32, u32[k], v32[k]);

  t = cpu(); printf( "mul32: %g secs\n", t);

  if( check) print( "    w32", w32, NN32, "%08x");

  return 0;
}