// compare speed of multiply for 22 12-bit pieces, with reduction at the end, // vs. 11 24-bit pieces, 17 16-bit, or 8 32-bit pieces, with reduction during the multiply // // assumes no 64-bit data type, unless use64 option is specified // // but mul and mul16 use only 32-bit data regardless of the use64 option // // 17 16-bit pieces are used instead of 16 pieces to handle the product of two sums // with no mod until the end, although the carries must be reduced during the multiply // // Usage: ./timing #count #nrand #seed check add use64 op|prod op16|prod16 op24|prod24 op32|prod32 // // defaults: 1000000 100 op op16 op24 op32 // // if nrand is 0 then maximum operand values 0xFF...FF are used // // if add option is specified only mul and mul16 are tested // // op|prod args select operand-scanning vs. product-scanning order (Comba) // // in operand-scanning order the elements of u and v are accessed in order: // // for i=0,1,...: for j=0,1,...: w[i+j] += u[i]*v[j] // // in product-scanning order the elements of w are accessed in order: // // for k=0,1,...: for t = 0,1,...: w[k] += u[i-t]*v[j+t] // // R. Perry, June 2017 #include #include #include #include #include #include // bit selection macros // #define LO16(u) ((u) & 0x0000FFFF) // bottom 16 bits #define HI16(u) ((u) >> 16) // top 16 bits #define LO32(u) ((u) & 0xFFFFFFFFLL) // bottom 32 bits #define HI32(u) ((u) >> 32) // top 32 bits #define LO24(u) ((u) & 0x00FFFFFF) // bottom 24 bits #define HI24(u) ((u) >> 24) // top 8 (or more) bits #define LO8(u) ((u) & 0x000000FF) // bottom 8 bits //------------------------------------------------------------------------------ // from impl.h,c #define B 12 // number of bits per piece #define MASK ((1<> B; w[i] &= MASK; } } // mul_op: operand-scanning order // void mul_op( D w, C u, C v) { for( int k = 0; k < NN; ++k) w[k] = 0; for( int i = 0; i < N; ++i) for( int j = 0; j < N; ++j) w[i+j] += u[i]*v[j]; reduce( w, NN); } // add12: w = u+v with no reduction // void add12( C w, C u, C v) { for( int i = 0; i < N; ++i) w[i] = u[i] + v[i]; } //------------------------------------------------------------------------------ // mul_prod: product-scanning order // void mul_prod( D w, C u, C v) { /* w[k] += u[i]*v[j] illustration using N = 4: k i j i starts at min(k,N-1) --- --- --- j starts at k-i 0 0 0 1 1, 0 0, 1 could use loop: 2 2, 1, 0 0, 1, 2 3 3, 2, 1, 0 0, 1, 2, 3 while( i >= 0 && j < N) w[k] += u[i--] * v[j++]; 4 3, 2, 1 1, 2, 3 5 3, 2 2, 3 6 3 3 7 final carry, if any (not needed when using 12-bit pieces because u[N-1] uses only 4 bits) (not needed when using 16-bit pieces because u[N16-1] uses only 1 bit) */ UI r = 0; for( int k = 0; k < NN; ++k) // note that NN is (2*N)-1, not 2*N { int i = (k < N) ? k : N-1, j = k-i, m = (N-j < i+1) ? N-j : i+1; for( int t = 0; t < m; ++t) r += u[i-t]*v[j+t]; w[k] = r & MASK; r >>= B; // final r value will be 0 } } //------------------------------------------------------------------------------ // 17 16-bit pieces #define N16 17 // u[N16-1] uses only 1 bit #define NN16 (2*N16-1) // mul16_op: operand-scanning order // void mul16_op( UI *w, UI *u, UI *v) { for( int k = 0; k < NN16; ++k) w[k] = 0; for( int i = 0; i < N16; ++i) { UI r = 0; for( int j = 0; j < N16; ++j) { int k = i+j; r += w[k] + u[i]*v[j]; w[k] = LO16(r); r = HI16(r); } if( i < N16-1) w[i+N16] = r; } } // mul16_prod: product-scanning order // void mul16_prod( UI *w, UI *u, UI *v) { UI r = 0, s = 0; for( int k = 0; k < NN16; ++k) { int i = (k < N16) ? k : N16-1, j = k-i, m = (N16-j < i+1) ? N16-j : i+1; for( int t = 0; t < m; ++t) { r += u[i-t]*v[j+t]; s += HI16(r); r = LO16(r); } w[k] = r; r = LO16(s); s = HI16(s); // store and shift } } // add16: w = u+v with reduction // void add16( UI *w, UI *u, UI *v) { UI r = 0; for( int i = 0; i < N16; ++i) { r += u[i] + v[i]; w[i] = LO16(r); r = HI16(r); } } //------------------------------------------------------------------------------ // helper routines for mul24 and mul32 // add 16-bit p to x (array of at least two 16-bit pieces) // static inline void add16p( UI *x, UI p) { x[0] += p; x[1] += HI16(x[0]); x[0] = LO16(x[0]); } // add 24-bit p to x (array of at least two 16-bit pieces) // static inline void add24p( UI *x, UI p) { UI plo = LO16(p), phi = HI16(p); x[0] += plo; x[1] += phi + HI16(x[0]); x[0] = LO16(x[0]); } // add 32-bit p to x (array of at least three 16-bit pieces) // static inline void add32p( UI *x, UI p) { UI plo = LO16(p), phi = HI16(p); x[0] += plo; x[1] += phi + HI16(x[0]); x[0] = LO16(x[0]); x[2] += HI16(x[1]); x[1] = LO16(x[1]); } //------------------------------------------------------------------------------ // 11 24-bit pieces #define N24 11 #define NN24 (2*N24) // no -1 here like NN, u[N24-1] is 16 bits // mul24_op: operand-scanning order // void mul24_op( UI *w, UI *u, UI *v) { for( int k = 0; k < NN24; ++k) w[k] = 0; for( int i = 0; i < N24; ++i) { UI x[4] = { 0, 0, 0, 0 }; // result and carry, 16-bit pieces for( int j = 0; j < N24; ++j) { int k = i+j; // work with 16-bit pieces (note that HI16(x) is only 8 bits) // x[0] += LO16(w[k]); x[1] += HI16(w[k]); UI ulo = LO16(u[i]), uhi = HI16(u[i]), vlo = LO16(v[j]), vhi = HI16(v[j]); add32p( x+0, ulo*vlo); // low part add24p( x+1, uhi*vlo + ulo*vhi); // middle parts add16p( x+2, uhi*vhi); // top part w[k] = (LO8(x[1]) << 16) | x[0]; // store 24 bits x[0] = (x[1] >> 8) | (LO8(x[2]) << 8); // shift by 24 bits x[1] = (x[2] >> 8) | (x[3] << 8); x[2] = x[3] = 0; } w[i+N24] = (LO8(x[1]) << 16) | x[0]; } } // mul24_op_64: operand-scanning order, 64-bit intermediate results // void mul24_op_64( UI *w, UI *u, UI *v) { for( int k = 0; k < NN24; ++k) w[k] = 0; for( int i = 0; i < N24; ++i) for( int j = 0; j < N24; ++j) { int k = i+j; unsigned long long r = u[i]; r *= v[j]; r += w[k]; w[k] = LO24(r); r = HI24(r); // unsigned long long y = w[k+1]; y += r; // debug testing w[k+1] += r; // this won't overflow // if( w[k+1] != y) printf( "bad\n"); } } // mul24_prod: product-scanning order // void mul24_prod( UI *w, UI *u, UI *v) { UI x[4] = { 0, 0, 0, 0 }; // result and carry, 16-bit pieces for( int k = 0; k < NN24; ++k) { int i = (k < N24) ? k : N24-1, j = k-i, m = (N24-j < i+1) ? N24-j : i+1; for( int t = 0; t < m; ++t) { // work with 16-bit pieces (note that HI16(x) is only 8 bits) // UI ulo = LO16(u[i-t]), uhi = HI16(u[i-t]), vlo = LO16(v[j+t]), vhi = HI16(v[j+t]); add32p( x+0, ulo*vlo); // low part add24p( x+1, uhi*vlo + ulo*vhi); // middle parts add16p( x+2, uhi*vhi); // top part } w[k] = (LO8(x[1]) << 16) | x[0]; // store 24 bits x[0] = (x[1] >> 8) | (LO8(x[2]) << 8); // shift by 24 bits x[1] = (x[2] >> 8) | (x[3] << 8); x[2] = x[3] = 0; } } // mul24_prod_64: product-scanning order, 64-bit intermediate results // void mul24_prod_64( UI *w, UI *u, UI *v) { unsigned long long r = 0; for( int k = 0; k < NN24; ++k) { int i = (k < N24) ? k : N24-1, j = k-i, m = (N24-j < i+1) ? N24-j : i+1; for( int t = 0; t < m; ++t) { r += ((unsigned long long) u[i-t]) * v[j+t]; } w[k] = LO24(r); r = HI24(r); } } //------------------------------------------------------------------------------ // 8 32-bit pieces #define N32 8 #define NN32 (2*N32) // no -1 here like NN, u[N32-1] is 32 bits // mul32_op: operand-scanning order // void mul32_op( UI *w, UI *u, UI *v) { for( int k = 0; k < NN32; ++k) w[k] = 0; for( int i = 0; i < N32; ++i) { UI x[5] = { 0, 0, 0, 0, 0 }; // result and carry, 16-bit pieces for( int j = 0; j < N32; ++j) { int k = i+j; // work with 16-bit pieces // x[0] += LO16(w[k]); x[1] += HI16(w[k]); UI ulo = LO16(u[i]), uhi = HI16(u[i]), vlo = LO16(v[j]), vhi = HI16(v[j]); add32p( x+0, ulo*vlo); // low part add32p( x+1, uhi*vlo); add32p( x+1, ulo*vhi); // middle parts add32p( x+2, uhi*vhi); // top part w[k] = (x[1] << 16) | x[0]; // store 32 bits x[0] = x[2]; x[1] = x[3]; x[2] = x[4]; x[3] = x[4] = 0; // shift by 32 bits } w[i+N32] = (x[1] << 16) | x[0]; } } // mul32_op_64: operand-scanning order, 64-bit intermediate results // void mul32_op_64( UI *w, UI *u, UI *v) { for( int k = 0; k < NN32; ++k) w[k] = 0; for( int i = 0; i < N32; ++i) { unsigned long long r = 0; for( int j = 0; j < N32; ++j) { int k = i+j; r += ((unsigned long long) u[i]) * v[j] + w[k]; w[k] = LO32(r); r = HI32(r); } w[i+N32] = r; } } // mul32_prod: product-scanning order // void mul32_prod( UI *w, UI *u, UI *v) { UI x[5] = { 0, 0, 0, 0, 0 }; // result and carry, 16-bit pieces for( int k = 0; k < NN32; ++k) { int i = (k < N32) ? k : N32-1, j = k-i, m = (N32-j < i+1) ? N32-j : i+1; for( int t = 0; t < m; ++t) { // work with 16-bit pieces // UI ulo = LO16(u[i-t]), uhi = HI16(u[i-t]), vlo = LO16(v[j+t]), vhi = HI16(v[j+t]); add32p( x+0, ulo*vlo); // low part add32p( x+1, uhi*vlo); add32p( x+1, ulo*vhi); // middle parts add32p( x+2, uhi*vhi); // top part } w[k] = (x[1] << 16) | x[0]; // store 32 bits x[0] = x[2]; x[1] = x[3]; x[2] = x[4]; x[3] = x[4] = 0; // shift by 32 bits } } // mul32_prod_64: product-scanning order, 64-bit intermediate results // void mul32_prod_64( UI *w, UI *u, UI *v) { unsigned long long s = 0; for( int k = 0; k < NN32; ++k) { unsigned long long r = LO32(s); s = HI32(s); int i = (k < N32) ? k : N32-1, j = k-i, m = (N32-j < i+1) ? N32-j : i+1; for( int t = 0; t < m; ++t) { r += ((unsigned long long) u[i-t]) * v[j+t]; s += HI32(r); r = LO32(r); } w[k] = r; } } //------------------------------------------------------------------------------ // print MSB first // void print( const char *msg, UI *u, int n, const char *fmt) { if( msg) printf( "%s = ", msg); for( int i = n-1; i >= 0; --i) printf( fmt, u[i]); putchar( '\n'); } // convert 22 12-bit pieces to 11 24-bit pieces // void convert24( UI *y, const UI *x) { for( int i = 0, j = 0; j < N; ++i, j += 2) y[i] = (x[j+1] << 12) | x[j]; } // convert 22 12-bit pieces to 16 16-bit pieces // void convert16( UI *y, const UI *x) { // x: x xxx xxx x xx xx x xxx xxx x xx xx x xxx xxx x xx xx x xxx // 21 20 19 18 17 16 15 14 13 12 11 10 9 8 // y: yyyy yyyy yyyy yyyy yyyy yyyy yyyy yyyy yyyy yyyy // 15 14 13 12 11 10 9 8 7 6 // // x: xxx x xx xx x xxx xxx x xx xx x xxx // 7 6 5 4 3 2 1 0 // y: yyyy yyyy yyyy yyyy yyyy yyyy // 5 4 3 2 1 0 // int i, j; for( i = 0, j = 0; j < N-1; i += 3, j += 4) { y[i] = (x[j+1] & 0xF) << 12 | x[j]; if( i == 15) break; y[i+1] = (x[j+2] & 0xFF) << 8 | x[j+1] >> 4; y[i+2] = x[j+3] << 4 | x[j+2] >> 8; } y[N16-1] = 0; } // convert 22 12-bit pieces to 8 32-bit pieces // void convert32( UI *y, const UI *x) { // x: x xxx xxx x xx xxx xxx xxx xxx xx x xxx xxx x xx xxx xxx // 21 20 19 18 17 16 15 14 13 12 11 10 9 8 // y: yyyyyyyy yyyyyyyy yyyyyyyy yyyyyyyy yyyyyyyy // 7 6 5 4 3 // // x: xxx xxx xx x xxx xxx x xx xxx xxx // 7 6 5 4 3 2 1 0 // y: yyyyyyyy yyyyyyyy yyyyyyyy // 2 1 0 // y[7] = x[21] << 28 | x[20] << 16 | x[19] << 4 | x[18] >> 8; y[6] = (x[18] & 0xFF) << 24 | x[17] << 12 | x[16]; y[5] = x[15] << 20 | x[14] << 8 | x[13] >> 4; y[4] = (x[13] & 0xF) << 28 | x[12] << 16 | x[11] << 4 | x[10] >> 8; y[3] = (x[10] & 0xFF) << 24 | x[9] << 12 | x[8]; y[2] = x[7] << 20 | x[6] << 8 | x[5] >> 4; y[1] = (x[5] & 0xF) << 28 | x[4] << 16 | x[3] << 4 | x[2] >> 8; y[0] = (x[2] & 0xFF) << 24 | x[1] << 12 | x[0]; } // return elapsed CPU time in seconds since the previous call // double cpu( void) { static double t0 = 0; struct rusage r; if( getrusage( RUSAGE_SELF, &r)) { perror( "getrusage"); exit(1); } // cumulative // double t = r.ru_utime.tv_sec + 1e-6*r.ru_utime.tv_usec + r.ru_stime.tv_sec + 1e-6*r.ru_stime.tv_usec; // delta // double dt = t - t0; t0 = t; return dt; } int main( int argc, char *argv[]) { // set options // // ./timing #count #nrand #seed check use64 op|prod op16|prod16 op24|prod24 op32|prod32 // int count = 1000000, nrand = 100, check, add, use64, prod, prod16, prod24, prod32; check = add = use64 = prod = prod16 = prod24 = prod32 = 0; int seed = time(0); if( argc > 1 && isdigit( argv[1][0])) { count = atoi( argv[1]); --argc; ++argv; } if( argc > 1 && isdigit( argv[1][0])) { nrand = atoi( argv[1]); --argc; ++argv; } if( argc > 1 && isdigit( argv[1][0])) { seed = atoi( argv[1]); --argc; ++argv; } for( int i = 1; i < argc; ++i) { if( strcmp( argv[i], "check") == 0) check = 1; else if( strcmp( argv[i], "add") == 0) add = 1; else if( strcmp( argv[i], "use64") == 0) use64 = 1; else if( strcmp( argv[i], "op") == 0) prod = 0; else if( strcmp( argv[i], "prod") == 0) prod = 1; else if( strcmp( argv[i], "op16") == 0) prod16 = 0; else if( strcmp( argv[i], "prod16") == 0) prod16 = 1; else if( strcmp( argv[i], "op24") == 0) prod24 = 0; else if( strcmp( argv[i], "prod24") == 0) prod24 = 1; else if( strcmp( argv[i], "op32") == 0) prod32 = 0; else if( strcmp( argv[i], "prod32") == 0) prod32 = 1; else { fprintf( stderr, "timing: bad arg: %s\n", argv[i]); fprintf( stderr, "Usage: ./timing #count #nrand #seed check use64 " "op|prod op16|prod16 op24|prod24 op32|prod32\n"); return 1; } } // multiplication functions, w = u*v // typedef void (*Mul)( D w, C u, C v); Mul mul = prod ? mul_prod : mul_op; Mul mul16 = prod16 ? mul16_prod : mul16_op; Mul mul24 = prod24 ? (use64 ? mul24_prod_64 : mul24_prod) : (use64 ? mul24_op_64 : mul24_op); Mul mul32 = prod32 ? (use64 ? mul32_prod_64 : mul32_prod) : (use64 ? mul32_op_64 : mul32_op); printf( "count = %d, nrand = %d, seed = %d, add = %d, ", count, nrand, seed, add); printf( "use64 = %d, prod,16,24,32 = %d %d %d %d\n", use64, prod, prod16, prod24, prod32); // create multiply inputs // srand(seed); int NRAND = (nrand <= 0) ? 1 : nrand; C u[NRAND], v[NRAND]; D w; if( nrand <= 0) // test using max values { u[0][N-1] = v[0][N-1] = 0xF; for( int i = 0; i < N-1; ++i) u[0][i] = v[0][i] = 0xFFF; } else // use random values { for( int k = 0; k < NRAND; ++k) { u[k][N-1] = rand() & 0xF; v[k][N-1] = rand() & 0xF; for( int i = 0; i < N-1; ++i) { u[k][i] = rand() & MASK; v[k][i] = rand() & MASK; } } } UI u16[NRAND][N16], v16[NRAND][N16], w16[NN16]; for( int k = 0; k < NRAND; ++k) { convert16( u16[k], u[k]); convert16( v16[k], v[k]); } UI u24[NRAND][N24], v24[NRAND][N24], w24[NN24]; for( int k = 0; k < NRAND; ++k) { convert24( u24[k], u[k]); convert24( v24[k], v[k]); } UI u32[NRAND][N32], v32[NRAND][N32], w32[NN32]; for( int k = 0; k < NRAND; ++k) { convert32( u32[k], u[k]); convert32( v32[k], v[k]); } if( check) { print( " u[0]", u[0], N, "%03x"); print( "u16[0]", u16[0], N16, "%04x"); print( " u24[0]", u24[0], N24, "%06x"); print( " u32[0] ", u32[0], N32, "%08x"); print( " v[0]", v[0], N, "%03x"); print( "v16[0]", v16[0], N16, "%04x"); print( " v24[0]", v24[0], N24, "%06x"); print( " v32[0] ", v32[0], N32, "%08x"); } double t = cpu(); if( add) // test mul() and mul16() multiplying two sums { C s1, s2; UI t1[NN16], t2[NN16]; // mul() // for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) { add12( s1, u[k], u[k]); add12( s2, u[k], v[k]); mul( w, s1, s2); } t = cpu(); printf( " mul: %g secs\n", t); if( check) print( " w", w, NN, "%03x"); // mul16() // for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) { add16( t1, u16[k], u16[k]); add16( t2, u16[k], v16[k]); mul16( w16, t1, t2); } t = cpu(); printf( "mul16: %g secs\n", t); if( check) print( "w16", w16, NN16, "%04x"); // exit // return 0; } // time mul() // for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul( w, u[k], v[k]); t = cpu(); printf( " mul: %g secs\n", t); if( check) print( " w", w, NN, "%03x"); // time mul16() // for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul16( w16, u16[k], v16[k]); t = cpu(); printf( "mul16: %g secs\n", t); if( check) print( "w16", w16, NN16, "%04x"); // time mul24() // for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul24( w24, u24[k], v24[k]); t = cpu(); printf( "mul24: %g secs\n", t); if( check) print( "w24", w24, NN24, "%06x"); // time mul32() // for( int i = 0, k = 0; i < count; ++i, k = (k+1) % NRAND) mul32( w32, u32[k], v32[k]); t = cpu(); printf( "mul32: %g secs\n", t); if( check) print( " w32", w32, NN32, "%08x"); return 0; }