// quantum computing emulation: Bernstein & Vazirani problem: // // Determine the hidden bit string (a) of a parity function: f(x) = a*x + b // // Distributed over a set of processing nodes using MPI, // with multiple threads on each node using OpenMP. // // Usage: mpirun [mpi_options] ./BV [-d] [-r] [-c] [-s] [nx] // // mpi_options examples: // // -n 8 -H vecr:16 (use 8 processes, allow up to 16 on host vecr) // -n 4 -H node-001,node-002,node-003,node-004 (1 each on nodes 1-4) // // BV options: // // nx = number of qubits for x, default = 5; nx+1 total qubits // nc = number of bits for MPI cache index, default = 0 (no cache) // seed = srand() seed, default = 0 // -d extra debug output, -d -d for even more output // -r use random global phase factor // // Each MPI node will use the same srand() seed, so values produced by rand() // will be consistent between the nodes. // // R. Perry, March 2019 // #include #include // atoi(), rand() #include #include #ifdef _OPENMP #include #endif #include "qce.h" // abbreviations for some MPI constants: // // the global communicator, i.e. every process of the current run // double complex data type // ignore status // message tag // #define comm MPI_COMM_WORLD #define cmplx MPI_DOUBLE_COMPLEX #define nostat MPI_STATUS_IGNORE #define tag 0 // constants, initialized in main() // static double pi, s2; // quit: print error message and exit // void quit( const char *msg) { fprintf( stderr, "BV: %s\n", msg); MPI_Abort( comm, 1); exit(1); } // classical a*x + b // unsigned long f( unsigned long x, unsigned long a, unsigned long b) { unsigned long r = b; x &= a; while( x) { r ^= (x & 1); x >>= 1; } return r; } // generate random global phase factor // double complex phase( int rphase) { double complex r = 1; if( rphase) { double a = pi*(2*DRAND-1); /* -pi ... pi */ r = exp(-I*a); } return r; } // HHT: perform Hadamard transformation on qubit k, distributed on a pair of nodes, // using MPI_Sendrecv() for the data transfer, and T=2**t threads per node // https://www.open-mpi.org//doc/v3.1/man3/MPI_Sendrecv.3.php // // c is a cache of size C for the MPI data transfer // void HHT( State q, unsigned int k, unsigned int n, unsigned int r, double complex *c, unsigned int C, unsigned int t, unsigned long T) { if( k < q.n || k >= n) quit( "HH: index argument out of range"); unsigned int h, s, mask = 1U << (k-q.n); // r = my node number (rank), s = my partner node number // // my partner node mask bit is the opposite of mine // if( r & mask) { h = 1; s = r & ~mask; } // my mask bit is high else { h = 0; s = r | mask; } // my mask bit is low #if 0 for( unsigned long i = 0; i < q.N; ) // ++i is performed in the inner loop #endif for( unsigned long i = 0; i < q.N; i += C) { MPI_Sendrecv( q.a+i, C, cmplx, s, tag, c, C, cmplx, s, tag, comm, nostat); #ifdef _OPENMP #pragma omp parallel for num_threads(T) #endif for( unsigned long j = 0; j < C; ++j) { q.a[i+j] = s2 * ( h ? c[j]-q.a[i+j] : q.a[i+j]+c[j] ); } #if 0 // OLD for( unsigned long j = 0; j < C; ++j, ++i) { q.a[i] = s2 * ( h ? c[j]-q.a[i] : q.a[i]+c[j] ); } #endif } } // HT: perform Hadamard transformation on qubit k using T=2**t threads // void HT( State q, unsigned int k, unsigned int t, unsigned long T) { #ifdef _OPENMP if( k >= q.n) quit( "HT: index argument out of range"); unsigned int b = q.n-t; // low-order bits of amplitude index unsigned long k1 = 1LU << k; // k1 = 0...010...0, k'th bit = 1 if( k < b) #pragma omp parallel num_threads(T) { // linear start,stop thread assignment: ID*N/T = ID*(2**n)/(2**t) = ID*2**(n-t) // unsigned long ID = omp_get_thread_num(), i = ID << b, j, stop = (ID+1) << b; // printf( "ID %lu, k %u, start %lu, stop %lu\n", ID, k, i, stop); while( i < stop) // i will have k'th bit = 0, j will have k'th bit = 1 { j = i | k1; double complex t0 = q.a[i], t1 = q.a[j]; q.a[i] = s2*(t0+t1); q.a[j] = s2*(t0-t1); ++i; i += (i & k1); // skip i values with k'th bit = 1 } } else // k >= b { unsigned int d = k-b; // low-order bits of ID, d >= 0 #pragma omp parallel num_threads(T) { unsigned long ID = omp_get_thread_num(), j, L = 1LU << (b-1); // // bit layout: // <------------------------------- n -----------------------------------> // <------ t-d-1 ----------><- 1 -><----- d ------><--- 1 ----><-- b-1 --> // i = top (t-d-1) bits from ID, bit k, d bits from ID, bit 0 of ID, // ^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ // one of the indicated pieces could be empty depending on k // unsigned long i = ((ID >> (d+1)) << (k+1)) | (((ID >> 1) & ((1LU << d) - 1)) << b) | ((ID & 1) << (b-1)); // printf( "ID %lu, k %u, start %lu, L %lu\n", ID, k, i, L); while( L) { j = i | k1; double complex t0 = q.a[i], t1 = q.a[j]; q.a[i] = s2*(t0+t1); q.a[j] = s2*(t0-t1); ++i; --L; } } } #else // not _OPENMP H( q, k); #endif // _OPENMP } //--------------------------------------------------------------------- int main( int argc, char *argv[]) { // constants // pi = 4*atan(1); s2 = sqrt(2)/2; // MPI setup // int i; unsigned int P, r; MPI_Init( &argc, &argv); MPI_Comm_size( comm, &i); P = i; // number of processes MPI_Comm_rank( comm, &i); r = i; // my number and name: char name[MPI_MAX_PROCESSOR_NAME]; MPI_Get_processor_name( name, &i); // check P = 2**p // unsigned int p = 0; while( (1U << p) < P) ++p; if( r == 0) printf( "BV: %u processes\n", P); if( (1U << p) != P) quit( "number of processes is not a power of 2"); // MPI_Barrier( comm); // this does not necessarily make above printf come out first // check T = 2**t // unsigned long T = 1; #ifdef _OPENMP T = omp_get_max_threads(); // OMP_NUM_THREADS #endif unsigned int t = 0; while( (1LU << t) < T) ++t; if( (1LU << t) != T) quit( "number of threads is not a power of 2"); printf( "BV: %u %s %lu\n", r, name, T); // command-line options // unsigned int rphase = 0, debug = 0, seed = 0, nc = 0; for( i = 1; i < argc && argv[i][0] == '-'; ++i) { switch( argv[i][1]) { case 'd': ++debug; break; case 'r': ++rphase; break; case 's': seed = atoi(argv[i]+2); srand(seed); break; case 'c': nc = atoi(argv[i]+2); break; default: quit( "bad option"); } } // nx = number of qubits for x // n = nx + 1, total number of qubits // m = n - p, qubits per node // unsigned int m, n, nx = 5; if( i < argc) nx = atoi(argv[i]); if( nx < 1) quit( "bad nx arg"); n = nx + 1; if( n <= p) quit( "too many nodes, not enough qubits"); m = n - p; if( m <= t) quit( "too many threads, not enough qubits"); State q = Q(m); // q.n = m // nc = number of bits for MPI cache index // c = cache, C = cache size // if( nc > m) nc = m; unsigned int C = 1U << nc; double complex *c = malloc( C*sizeof(double complex)); if( !c) quit( "cache malloc failed\n"); if( r == 0) printf( "BV: nc = %u, C = %u = %u bytes\n", nc, C, (unsigned int)(C*sizeof(double complex)) ); // parity function parameters // unsigned long a = 1+IRAND((1LU<|1> // double complex v = phase( rphase); // all must do this to keep rand() sync'd zero(q); if( r == 0) q.a[1] = v; // proc 0 only char msg[100]; sprintf( msg, "%u: q", r); if( debug > 1) print( msg, q, 0); // apply H to all of the qubits // // first apply H to qubits which can be updated locally // // in general have to do this on every node, however at this point in this program // the q's are all zero except for proc 0, q.a[1], so we can just do that one: // if( r == 0) for( unsigned int k = 0; k < m; ++k) HT( q, k, t, T); // apply H to qubits which must be updated between pairs // for( unsigned int k = m; k < n; ++k) HHT( q, k, n, r, c, C, t, T); sprintf( msg, "%u: H", r); if( debug > 1) print( msg, q, 0); // perform (x,y) -> (x,y+f(x)) // unsigned long R = (unsigned long)r << (m-1); // my number in the high bits for( unsigned long i = 0; i < q.N; i += 2) if( f(R|(i>>1),a,b) ) swap( q, i, i|1); sprintf( msg, "%u: f", r); if( debug > 1) print( msg, q, 0); // apply H to all of the qubits // // first apply H to qubits which can be updated locally // for( unsigned int k = 0; k < m; ++k) HT( q, k, t, T); // apply H to qubits which must be updated between pairs // for( unsigned int k = m; k < n; ++k) HHT( q, k, n, r, c, C, t, T); sprintf( msg, "%u: q", r); if( debug) print( msg, q, 0); if( r == (ma >> m) ) // node which contains the final measured value { unsigned long mi = ma & ((1LU<