/*
This is a new benchmark program for LAPACK and BLAS, intended to provide
a quick rating of a machine's floating point performance, or conversely,
to quickly (but roughly) evaluate the effect of an upgraded linear algebra
library.  

Similar to the official benchmark, it devotes approximately equal time to: 
    Matrix multiplication using DGEMM (level 3 BLAS)
    Solve linear equation using DGESV
    Eigenvalues and eigenvectors using DGEEV
It does one or two trial runs to assess timing, then puts most of the assigned
time into one big matrix.  It reports seconds/N^3 for this matrix (N=size).  

Command line args (defaults shown):
    -t 30	How long (floating point seconds) to run the test.
    -v		Print detailed timing (for debugging).
    -V		Print matrices in full (for debugging, very prolix).
    -W		Same as -V but exit after the first matrix whether good or bad.
    -N 10	Start trial runs with this matrix size.  
    -m 0	Abort after this many tests (for debugging), 0->2^32 (infinite)
    -c		Check the results (for debugging), omit for a speed test.
    -e 1e-12	Relative error needed to pass the checks.  
    -s "dgemm,dgesv,dgeev"	Tests to do, lower case, comma separated
    -I time()	Random seed value, for repeatable inputs when debugging.
		If omitted, the current time is used.  
*/

/* Selected results from CouchNet machines (2015-03-24):  
All without checking, about 10 secs per matrix
	DGEMM		DGESV		DGEEV		CPU
	N     Sec/N^3	N     Sec/N^3	N     Sec/N^3	
Diamond	2212  8.592e-10	3076  2.825e-10	896   7.727e-09	i7-3517UE 1.70GHz x2
Jacinth	1018  8.739e-09	1538  2.822e-09	512   7.578e-08	AMD G-T40E 1.0GHz x2
Piki	1238  4.691e-09	1901  1.527e-09	566   3.284e-08	AMD Athlon 2650e 1.6GHz
Aurora	1308  3.972e-09	2023  1.315e-09	630   2.832e-08	AMD Ath 6850e 1.8GHz x2
Iris	1165  5.839e-09	1842  1.888e-09 550   3.609e-08	AMD G-T56N 1.65GHz x2
Kermit	1272  4.174e-09	1901  1.450e-09 248   4.662e-08 AMD E-350 1.60GHz x2
Xena	2324  7.708e-10 3220  2.485e-10 938   6.502e-09	i7-3632QM 2.20GHz x4

For the above machines, comparing Sec/N^3, DGESV/DGEMM = 0.33 very consistently,
while DGEEV/DGEMM (average) = 8.2 (range 6.2 .. 11.2).  For the overall score 
I'll use the average (over tests done) of (Sec/N^3)/factor.  

In the "speedsha" benchmark, the integer and I/O measures have units of
kbytes/sec.  Letting SN = Sec/N^3, it turns out that 0.1/SN equals the integer
score (for 1 core, since the benchmark uses only one core also) within +-20%,
except Iris is half as fast, Jacinth is 1/3 as fast, and Kermit is 5 times
faster than its integer score.  For matrix multiplication, each step requires
fetching 2 x 8 bytes, most likely from cache, multiplying them, and adding
the product to a sum in a register, which on the old 8087 coprocessor was done 
in parallel.  The integer task requires running each byte through the SHA-512
algorithm, which is more complicated than a floating multiply and add. 

Diamond's equivalent kbyte/sec: 116387

Results from some Mathnet machines (2015-03-24):
Host		sha512	sha*cor	I/O	Sum	Lapmark	CPU
achilles	199200	796800	10507	179553	174856	i7-4770S 3.10GHz x4
aries		170294	681176	11179	154596	153785	i7-4790S 3.20GHz x4
cepeda		163063	326126	9886	106857	127741	i5-2400  3.10GHz x2
aegean		37146	74292	9455	27943	98141	i5-660 3.33GHz x2
joshua01	75396	452376	10631	90136	87731	Xeon X5650 2.67GHz x6
pong		218925	218925	10421	114673	84341	E8500 3.16GHz x1
julia		93275	373100	12220	87725	73481	Xeon E5520 2.27GHz x4
gojira		73456	293824	8926	68737	64164	Xeon X7350 2.93GHz x4
nemo01		51866	51866	4601	28233	41930	Xeon 3.40GHz x1
laguna		19279	19279	5650	12464	25198	Xeon 3.06GHz x2
koala		6023	12046	6808	7168	11270	Atom Z550 2.0GHz x1
harlech		16871	16871	5642	11256	11003	Xeon 3.06GHz x1

	
*/

#include <stdlib.h>	/* for drand48 () */
#include <math.h>	/* for sqrt() */
#include <sys/time.h>	/* for gettimeofday and struct time */
#include <string.h>	/* for memcpy */
#include <stdio.h>	/* for printf and friends */
#include <unistd.h>	/* for getopt */
#include <mcheck.h>	/* malloc checking functions */

/* Get command line arguments.  opt['x'] points to a value of the appropriate
    type, holding the value for -x, or an arbitrary pointer if -x is not
    supposed to have a value, or it's NULL if -x was not specified.  */
#define NOPT 128
static const void* opt[NOPT] = {NULL,};	/*opt['x'] = -x value, "1" if lack val*/
extern char* optarg;
extern int optint;
void jgetopt(
    int argn,		/* Number of command line args including argv[0] */
    char* const argv[]	/* Text of the command line args */
) {
    signed char c;
    const void* value;
    int j;
		/* Set the default values for some parameters. */
    static char* const defaults[] = 
	{"", "-e", "1e-12", "-t", "30", "-N", "10", 
	    "-m", "2147483647", "-s", "dgemm,dgesv,dgeev", NULL};
    if (!opt['e']) {
	opt['e'] = &c;
	jgetopt(sizeof(defaults)/sizeof(*defaults)-1, defaults);
	opt['I'] = value = malloc(sizeof(int));
	time_t now;
	time(&now);
	*(int*)value = now;
    }
    static const char optstr[] = "cvVWe:t:N:m:s:I:";
    optind = 1;
    while ((c = getopt(argn, argv, optstr)) >= 2) {
	if (c >= NOPT) continue;
	if (!optarg) {
	    opt[c] = "1";
	} else {
	    switch (c) {
		case 'e':
		case 't':
		    opt[c] = value = malloc(sizeof(double));
		    sscanf(optarg, "%lf", (double*)value);
		    break;
		case 'm':
		case 'I':
		case 'N':
		    opt[c] = value = malloc(sizeof(int));
		    sscanf(optarg, "%d", (int*)value);
		    break;
		default:
		    opt[c] = strdup(optarg);
		    break;
	    }
	}
    }
    if (opt['W']) opt['V'] = opt['W'];
}

/* Takes the difference of two struct timeval.  Result is (double)seconds. */
double timediff(
    struct timeval* now,
    struct timeval* prev
) {
    return (now->tv_sec - prev->tv_sec) + 1e-6*(now->tv_usec - prev->tv_usec);
}

/* Allocates a vector and stuffs it with random values. */
double* vecget(		/* Returns a pointer to the vector, or NULL if failed.*/
			/* The caller must free() the result.*/
    int N		/* Number of elements needed */
) {
    double* vec = malloc(sizeof(double)*N);
    double* cpnt = vec;
    while (--N >= 0) {
	*cpnt++ = drand48();
    }
    return vec;
}

/* Does one matrix multiplication using a naive algorithm. */
void matxmat_inC(
    const int N,	/* Dimension of all the matrices */
    double* C,		/* Matrix product goes here */
    double* A,		/* Left factor matrix */
    double* B		/* Right factor matrix */
) {
    int i, j, k;
    double* a, *b, *c = C;
    double t;
    for (i = 0; i < N; ++i) {
	for (j = 0; j < N; ++j) {
	    t = 0.0;
	    a = A + i*N;
	    b = B + j;
	    for (k = 0; k < N; ++k) {
		t += *a++ * *b;
		b += N;
	    }
	    *c++ = t;
	}
    }
}

/* Does one matrix multiplication using a naive algorithm. 
    In "C", A[i][j] comes just before A[i][j+1] while A[i+1][j] is N cells down.
    The way arrays are laid out in Fortran, 
    C[i][j] = sum(k) A[k][j] * B[i][k] (using "C" subscripting)
    But they would write it
    C(i,j) = sum(k) A(j,k) * B(k,i) 
    In Fortran, changing the first subscript by 1 moves to the next cell, while
    incrementing the second subscript moves N cells down.  
    Semantically, the first subscript selects 1 row and the second subscript
    goes to elements in that row, or the second subscript selects a column
    and the first subscript selects its members.  Thus columns are contiguous
    in Fortran.  This is backwards from "C".  
*/
void matxmat(
    const int N,	/* Dimension of all the matrices */
    const int Q,	/* Number of rows in B and C, give 1 for a vector */
    const int tr,	/* What's transposed: 1 = A, 2 = B, 4 = C */
			/* 0 = "C" indexing, 1,2,4 = Fortran indexing. */
			/* When Q == 1, only bit 1 has an effect. */
    double* C,		/* Matrix product goes here (NxQ) */
    double* A,		/* Left factor matrix (NxN) */
    double* B		/* Right factor matrix (NxQ) */
) {
    int i, j, k;
    double* a, *b, *c;
    double t;
    int da = ((tr & 1) ? N : 1), ia = ((tr & 1) ? 1 : N);
    int db = ((tr & 2) ? 1 : Q), ib = ((tr & 2) ? Q : 1);
    int dc = ((tr & 4) ? Q : 1), ic = ((tr & 4) ? 1 : Q);
    for (i = 0; i < N; ++i) {
	c = C + i*ic;
	for (j = 0; j < Q; ++j) {
	    t = 0.0;
	    a = A + i*ia;
	    b = B + j*ib;
	    for (k = 0; k < N; ++k) {
		t += *a * *b;
		a += da;
		b += db;
	    }
	    *c = t;
	    c += dc;
	}
    }
}

/* AXPY: multiply a matrix by a vector and accumulate the result. 
    C += sign * A * B  (sign doesn't have to be +-1) */
void axpy(
    const int N,	/* Dimension of all the matrices */
    double* C,		/* Matrix product goes here, vector(N), init to 0.0 */
    double* A,		/* Left factor matrix (NxN) */
    double* B,		/* Right factor, vector(N) */
    double sign		/* Multiply the product by this before adding to C */
) {
    int i, j, k;
    double* a, *b, *c = C;
    double t;
    for (j = 0; j < N; ++j) {
	t = 0.0;
	b = B;
	a = A + j;
	for (k = 0; k < N; ++k) {
	    t += *b++ * *a;
	    a += N;
	}
	*c++ += sign * t;
    }
}

/* Compares two vectors or matrices.  If the RMS value of their difference
    is over eps * the average of their RMS values, an error is signalled. */
int compare(		/* Returns -1 on error, 0 on success */
    double eps,		/* Relative error allowed */
    const int N,	/* Number of components (N*N for matrices) */
    double* A,		/* First vector */
    double* B,		/* Second vector */
    const char* msg	/* A label for the error message, no ending newline */
) {
    double Arms = 0.0, Brms = 0.0, Drms = 0.0, t;
    int rc = 0;
    int i;
    eps *= sqrt((double)N); /* Inflate allowed error for multiple comparisons */
    for (i = 0; i < N; ++i) {
	Arms += *A * *A;
	Brms += *B * *B;
	t = *A++ - *B++;
	Drms += t * t;
    }
    if (Arms <= 0.0) Arms = 0.0;
    Arms = sqrt(Arms);
    if (Brms <= 0.0) Brms = 0.0;
    Brms = sqrt(Brms);
    t = (Arms > Brms) ? Arms : Brms;
    Drms = sqrt(Drms);
		/* Signal an error if the relative difference is too big.
		    Including if both operands are totally zero. */
    if (Drms  >= eps*t) {
	printf("Diff RMS %10.3e, max allowed %10.3e (relative %10.3e) for %s\n",
	    Drms, eps*t, eps, msg);
	rc = -1;
    }
    return rc;
}

/* Prints a matrix (or vector) on stdout. */
void printmat(
    const int N,	/* Dimension of the rows (adjacent elements) */
    const int R,	/* Number of rows, give 1 for a vector */
    const double* A,	/* The matrix to be printed */
    const char* label	/* A title line.  A preceeding and trailing \n is
			provided by the subroutine. */
) {
    int i, j;
    const double* a = A;
    printf("\n%s\n", label);
    fputs("         0", stdout);
    for (j = 1; j < N; ++j) {
	printf("%10d", j);
    }
    fputs("\n", stdout);
    for (i = 0; i < R; ++i) {
	printf("%4d ", i);
	for (j = 0; j < N; ++j) {
	    printf("%10.3e", *a++);
	}
	fputs("\n", stdout);
    }
}


/* Matrix multiplication: C = alpha * A * B + beta*C (with beta = 0) */
extern void dgemm_(
    const char* transa,	/* For A, 'N' if normal, 'T' or 'C' if transposed */
    const char* transb,	/* Same for B */
    const int* M,	/* Number of rows in A and C */
    const int* N,	/* Number of columns in B and C */
    const int* K,	/* Number of columns of A and rows of B */
    const double* alpha, /* Multiplier of the product */
    const double* A,	/* Matrix A (left factor) */
    const int* lda,	/* Row length of A (at least M) */
    const double* B,	/* Matrix B (right factor) */
    const int* ldb,	/* Row length of B (at least K) */
    const double* beta,	/* Multiplier for original C, 0 to not preset C */
    double* C,		/* Matrix C for result and accumulation */
    const int* ldc);	/* Row length of C (at least M) */
	/* In Fortran, row elements are adjacent and columns are separated by
	the row length (there could be unused cells at the end of each row). */

/* Does one simplified matrix multiplication using DGEMM */
/* With checking takes about 2.4x without checking using stock BLAS. */
int dgemm_test(		/* Returns "info" output, always 0 for DGEMM */
    const int N,	/* Dimension of all the matrices */
    double* stg[],	/* Storage for matrices */
    const char* label	/* Description for error message */
) {
    if (!stg[0]) {
	stg[0] = vecget(N*N);			/* First factor (A) */
	stg[1] = vecget(N*N);			/* Second factor (B) */
	stg[2] = malloc(sizeof(double)*N*N);	/* Product here (C) */
	stg[3] = malloc(sizeof(double)*N*N);	/* Product here for chk (C2) */
    }
    double *A = stg[0];				/* First factor */
    double *B = stg[1];				/* Second factor */
    double* C  = stg[2];			/* Product here */
    char label2[80];
    int cmp;
    const char en = 'N';
    const double alpha = 1.0, beta = 0.0;
    dgemm_(&en, &en, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

    if (opt['c']) {
	double* C2  = stg[3];			/* Product here for checking */
	matxmat(N, N, 7, C2, A, B);
	cmp = compare(*(double*)(opt['e']), N*N, C, C2, label);
	if (opt['V'] || cmp) {
	    snprintf(label2, sizeof(label2), "Factor A (%s)", label);
	    printmat(N, N, A, label2);
	    snprintf(label2, sizeof(label2), "Factor B (%s)", label);
	    printmat(N, N, B, label2);
	    snprintf(label2, sizeof(label2), "Product (dgemm) (%s)", label);
	    printmat(N, N, C, label2);
	    snprintf(label2, sizeof(label2), "Product (check) (%s)", label);
	    printmat(N, N, C2, label2);
	}
	if (cmp) exit(4);
    }
    return 0;
}
    
/* Solve linear equations: Find X such that A * X = B
    Uses LU decomposition internally.  */
extern void dgesv_ (
    const int* N,	/* Dimension of A (square), X, B (rows only) */
    const int* NRHS,	/* Column dimension of X and B */
    double* A,		/* Matrix A (trashed in LU decomposition) */
    const int* LDA,	/* Row length of A (at least N) */
    int* IPIV,		/* Space for a permutation, N cells */
    double* B,		/* "Matrix" B (on input) and X (output), NRHS columns */
    const int* LDB,	/* Row length of B and X (at least N) */
    int* INFO);		/* 0 = OK, <0 = invalid arg, >0 = array is singular. */

/* Does one set of linear equations using DGESV */
/* Checking takes o(N^2) while the main computation is o(N^3): negligible. */
int dgesv_test(		/* Returns "info", 0 = OK */
    const int N,	/* Dimension of all the matrices */
    double* stg[],	/* Storage for matrices */
    const char* label	/* Description for error message */
) {
    if (!stg[0]) {
	stg[0] = malloc(sizeof(double)*N*N);	/* Copy of A (trashed) */
	stg[1] = malloc(sizeof(double)*N);	/* Copy of B, output here */
	stg[2] = vecget(N*N);			/* The matrix */
	stg[3] = vecget(N);			/* Vector of equation values */
	stg[4] = malloc(sizeof(double)*N);	/* Check product, should == B */
	stg[5] = malloc(sizeof(int)*N);		/* Space for a permutation */
    }
    double *A  = stg[0];			/* The matrix (trashed) */
    double *B  = stg[1];			/* Copy of B */
    double* A2 = stg[2];			/* Matrix (original) */
    double* B2 = stg[3];			/* Vector of equation values */
    int* ipiv  = (int*)(void*)stg[5];		/* Space for a permutation */

    const int one = 1;
    int info = 0, cmp = 0;
    char label2[80];

    memcpy(A, A2, sizeof(*A)*N*N);
    memcpy(B, B2, sizeof(*B)*N);
    dgesv_(&N, &one, A, &N, ipiv, B, &N, &info);

    if (opt['c']) {
	double* B3 = stg[4];			/* Check product, should == B */
	matxmat(N, 1, 5, B3, A2, B);
		/* Error amplification can occur due to subtracting near-equal
		    values, so relax the error criterion.   Cross fingers. */
	cmp = compare(100* *(double*)(opt['e']), N, B2, B3, label);
	if (opt['V'] || cmp) {
	    snprintf(label2, sizeof(label2), "Matrix (%s)", label);
	    printmat(N, N, A2, label2);
	    snprintf(label2, sizeof(label2), "Result (%s)", label);
	    printmat(N, 1, B, label2);
	    snprintf(label2, sizeof(label2), "Constants (dgesv) (%s)", label);
	    printmat(N, 1, B2, label2);
	    snprintf(label2, sizeof(label2), "Constants (check) (%s)", label);
	    printmat(N, 1, B3, label2);
	}
    }
    if (info || cmp) exit(4);
    return info;
}

    
/* Eigenvalues and eigenvectors 
    A * vr(j) = lambda(j) * vr(j) and the transpose of this for left e-vecs
    Uses the QR algo internally.  */
extern void dgeev_ (
    const char* JOBVL,	/* Left e-vecs: V=wants, N=omit */
    const char* JOBVR,	/* Same for right e-vecs */
    const int* N,	/* Dimension of A, VL, VR */
    double* A,		/* Matrix A (trashed) */
    const int* LDA,	/* Row length of A */
    double* WR,		/* Eigenvalues, real parts */
    double* WI,		/* Eigenvalues, imaginary parts */
    double* VL,		/* Left eigenvectors */
    const int* LDVL,	/* Row length of VL */
    double* VR,		/* Right eigenvectors */
    const int* LDVR,	/* Row length of VR */
    double* WORK,	/* A work area, size LWORK */
    const int* LWORK,	/* Length of WORK, at least 3N,
	or 4N if returning e-vecs, larger gives better performance.  If value
	is -1, then WORK[1] is set to the optimal value of LWORK.  */
    int* INFO);		/* 0 = OK, <0 = invalid argument, >0 = QR algo botched*/

/* Does one eigenvalue solution using DGEEV */
/* Checking takes about 1.3x the basic operation, with stock BLAS. */
int dgeev_test(		/* Returns "info", 0 = OK */
    const int N,	/* Dimension of all the matrices */
    double* stg[],	/* Storage for matrices */
    const char* label	/* Description for error message */
) {
    int i, j;
    int info = 0, cmp = 0;
    int wsz;
    double* initted = stg[0];
    if (!initted) {
	stg[0] = malloc(sizeof(double)*N*N);	/* Matrix to eigenize (trash)*/
	stg[1] = vecget(N*N);			/* Permanent copy of A */
	stg[2] = malloc(sizeof(double)*N*N);	/* Its eigenvectors (packed) */
	stg[3] = malloc(sizeof(double)*2*N*N);	/* Eigenvectors (real, img) */
	stg[4] = malloc(sizeof(double)*2*N);	/* Its eigenvalues */
	stg[5] = malloc(sizeof(double)*2*N*N);	/* Checking A * Bf */
	stg[6] = malloc(sizeof(double)*2*N*N);	/* Checking e-val * Bf */
	wsz = 4*N;				/* Size of work area */
	stg[7] = malloc(sizeof(double)*(wsz+1));/* Work area, realloced later */
	stg[7][0] = (double)wsz;
    }
    double *A = stg[0];		/* Matrix to eigenize (trashed)*/
    double* A2 = stg[1];	/* Original matrix */
    double* B  = stg[2];	/* Its eigenvectors (packed) */
    double* E  = stg[4];	/* Its eigenvalues (real, imag)*/
    double* W  = stg[7]; 	/* Work area, realloced later */
    wsz = (int)W[0];		/* Size of work area */
		/* Realloc the work area to the optimal size */
    if (!initted) {
	wsz = -1;
	dgeev_("N", "V", &N, A, &N, E, E+N, NULL, &N, B, &N, W, &wsz, &info);
	wsz = (int)W[0];
	W = stg[7] = realloc(W, sizeof(double)*(wsz+1));
	W[0] = (double)wsz;
    }
    double t;
    char label2[80];

    memcpy(A, A2, sizeof(*A)*N*N);
    dgeev_("N", "V", &N, A, &N, E, E+N, NULL, &N, B, &N, W+1, &wsz, &info);

    if (opt['c']) {
	double* Bf = stg[3];	/* Its eigenvectors (real, imag) */
	double* AB = stg[5];	/* Checking A * Bf */
	double* LB = stg[6];	/* Checking e-val * Bf */
	    /* Expand the eigenvectors in full, NxN real parts, then imag.
	        Each "column" (contiguous set of N elts) is an eigenvector. */
	for (i = 0; i < N; ) {
	    if (E[N+i] == 0.0) {	/* A really real eigenvalue (rare) */
		memcpy(Bf + N*i, B + N*i, sizeof(double)*N);
		memset(Bf + N*(i+N), 0, sizeof(double)*N);
		++i;
	    } else {
			/* real(B[i]) to Bf[i] (real) */
		memcpy(Bf + N*i, B + N*i, sizeof(double)*N);
			/* real(B[i]) to Bf[i+1] (real) */
		memcpy(Bf + N*(i+1), B + N*i, sizeof(double)*N);
			/* imag(B[i]) to Bf[i] (imag) */
		memcpy(Bf + N*(i+N), B + N*(i+1), sizeof(double)*N);
			/* -imag(B[i]) to Bf[i+1] (imag) (sign reversed) */
		for (j = 0; j < N; ++j) {
		    Bf[j + (i+1+N)*N] = -B[j + (i+1)*N];
		}
		i += 2;
	    }
	}
			/* AB = A * Bf, same layout as Bf.  A is pure real. */
	matxmat(N, N, 7, AB, A2, Bf);		/* Real parts first */
	matxmat(N, N, 7, AB+N*N, A2, Bf+N*N);	/* Imaginary parts */
			/* LB = eigenvalues * Bf; everything is complex. */
	for (i = 0; i < N; ++i) {
	    for (j = 0; j < N; ++j) {
		LB[i*N + j] = E[i]*Bf[i*N + j] - E[i+N]*Bf[(i+N)*N + j];
		LB[(i+N)*N + j] = E[i]*Bf[(i+N)*N + j] + E[i+N]*Bf[i*N + j];
	    }
	}
	cmp = compare(*(double*)(opt['e']), 2*N*N, AB, LB, label);
	if (opt['V'] || cmp) {
	    snprintf(label2, sizeof(label2), "Matrix (%s)", label);
	    printmat(N, N, A2, label2);
	    snprintf(label2, sizeof(label2), "Eigenvectors (packed) (%s)", label);
	    printmat(N, N, B, label2);
	    snprintf(label2, sizeof(label2), "Eigenvectors (R/I) (%s)", label);
	    printmat(N, 2*N, Bf, label2);
	    snprintf(label2, sizeof(label2), "Eigenvalues (R/I) (%s)", label);
	    printmat(N, 2, E, label2);
	    snprintf(label2, sizeof(label2), "Check EV (A*EV) (%s)", label);
	    printmat(N, 2*N, AB, label2);
	    snprintf(label2, sizeof(label2), "Check EV (l*EV) (%s)", label);
	    printmat(N, 2*N, LB, label2);
	}
    }
    if (info || cmp) exit(4);
    return info;
}

/* Repeats one of the tests for a specified time.  */
#define MAXSTG 8	/* Number of slots in the operand array. */
double repeat(		/* Returns the run time per matrix */
    double secs,	/* How long to run in seconds */
    const int N,	/* Dimension of the matrices */
    int (*testpgm)(int, double**, const char*),	/* Test program (dimension, matrices) */
    int* niter,		/* Number of repetitions */
    const char* label	/* Description of the test, no ending newline */
) {
    double* stg[MAXSTG] = {NULL, };		/* Operand storage */
    double t, secper;
    int i, j, info;
    char labelNN[80];

    *niter = 0;					/* Number of trials done */
    snprintf(labelNN, sizeof(labelNN), "%s (%dx%d)", label, N, N);
    
    struct timeval start, now, prev;
    gettimeofday(&start, NULL);
    memcpy(&now, &start, sizeof(now));
    for (;;) {
	if (--*(int*)(opt['m']) < 0) {
	    printf("Execution cut off due to -m.\n");
	    exit(0);
	}
	++*niter;
	info = (*testpgm)(N, stg, labelNN);
	if (info) printf("Error code %d from %s\n", info, labelNN);
	if (opt['W']) {
	    printf("Execution terminated due to -W\n");
	    exit(4);
	}
	memcpy(&prev, &now, sizeof(prev));
	gettimeofday(&now, NULL);
	secper = timediff(&now, &prev);
	t = timediff(&now, &start);
	if (t >= 0.7*secs) break;
    }
    for (j = MAXSTG; --j >= 0; ) {
	if (stg[j]) free (stg[j]);
    }
    return (t / (double)(*niter));
}

/* Does the test in two steps, a trial run and then one big problem.  
    Let N = matrix dimension and spf = seconds per flop, then each problem
    should take spf * flopx * N^3 (seconds).  */
double dotest(				/* Returns Sec/N^3 */
    double secs,			/* How long the test should run */
    int (*testpgm)(int, double**, const char*), /* Test function */
    const char* label			/* Description for the result report */
) {
    printf("\n%s (starting)\n", label);
    int N = *(int*)opt['N'];		/* Matrix dimension for trial run */
    int niter = 999;			/* Number of iterations */
    int done = 0;
    double secper;			/* Time per matrix */
    double spf;				/* Seconds per flop */
    double dt = 0.02*secs;		/* How long to expend on this N */
    for (;;) {
	secper = repeat(dt, N, testpgm, &niter, label);
	spf = secper/((double)N*(double)N*(double)N);
	secs -= niter*secper;
	if (opt['v']) 
	    printf("%s, N=%2d, %10.3e sec/N^3, %10.3e sec/mtx, %d reps\n", 
		(done ? "Final" : "Trial"), N, spf, secper, niter);
	if (done) break;
	if (niter <= 1) {
	    dt = secs;
	    done = 1;
	}
		/* All 3 problems scale as N^3.  Find N so one problem
		will use the whole remaining time. 
		(N^3 * secs/secper)^(1/3) = big matrix size */
	N = (int)(((double)N) * pow((dt/secper), 1.0/3));
    }
    if (!opt['v'])
	printf("    Final: N = %d  %10.3e sec/N^3  %10.3e sec/matrix\n",
	    N, spf, secper);
    return spf;
}

/* Main program */
int main(
    int argn,
    char* argv[]
) {
    static int (*testpgm[3])(int, double**, const char*) = 
	{ &dgemm_test, &dgesv_test, &dgeev_test };
    static const char* name[3] = { "dgemm", "dgesv", "dgeev" };
    static const char* label[3] = {
	"Matrix multiply with DGEMM",
	"Linear equations with DGESV",
	"Eigenvalues and vectors with DGEEV" };
		/* For each test, divide this by Sec/N^3 and you get an
		    equivalent speed in bytes/sec, that is within 20% of the
		    speed on the SHA512 algorithm (integers) for half of the
		    test machines.  Outliers: 1/3x, 1/2x, 5x. */
    static const double work[3] = { 0.1, 0.033, 0.82 };
    double spf, score = 0.0;
    int j;

    jgetopt(argn, argv);
    long int seed = *((int*)(opt['I']));
    srand48(seed);
    drand48();				/* Time isn't that random, stir once */
    const char* q;
    int ntest = 1;			/* Number of tests requested */
    for (q = (const char*)opt['s']; *q; ++q) {
	if (*q == ',') ++ntest;
    }
    double secs = *(double*)(opt['t'])/ntest;
    ntest = 0;				/* Count tests ACTUALLY done */
    for (j = 0; j < (sizeof(name)/sizeof(*name)); ++j) {
	if (strstr(opt['s'], name[j])) {
	    ntest++;
	    spf = dotest(secs, testpgm[j], label[j]);
	    score += work[j]/spf;
	}
    }
    printf("Overall: %d kbytes/sec\n", (int)(1.0e-3*score/ntest));
    return 0;
}
    
