Added benchmark C files.

38c72a01 · Jayke Meijer · ab223369 · 38c72a01 · 38c72a01 · 38c72a01
Commit 38c72a01 authored Nov 04, 2011 by Jayke Meijer
9 changed files
--- a/benchmarks/Makefile
+++ b/benchmarks/Makefile
+# Compiler and flags used for every benchmark.
+CC=xgcc
+CFLAGS=-O0
+# all/asm/clean name actions, not files.
+.PHONY: all asm clean
+# C source -> assembly listing.
+%.s: %.c
+	$(CC) $(CFLAGS) -S -o $@ $<
+# Assembly listing -> object file.
+%.o: %.s
+	$(CC) $(CFLAGS) -c -o $@ $<
+# Object file -> executable; without -o the result would be written to a.out.
+%: %.o
+	$(CC) $(CFLAGS) -o $@ $<
+all: acron clinpack dhrystone pi slalom whet
+asm: acron.s clinpack.s dhrystone.s pi.s slalom.s whet.s
+clean:
+	rm -f acron clinpack dhrystone pi slalom whet
+	rm -f *.o
+	rm -f *.s
+	rm -f *.cycles *.output
--- a/benchmarks/acron.c
+++ b/benchmarks/acron.c
+#include <ctype.h>
+#include <stdio.h>
+/* Number of candidate words in w[]. */
+#define N	6
+/* Candidate words; do_perm() copies the leading upper-case letters of each chosen word into the acronym. */
+char *w[] = {"Vertalerbouw", "Ertalerbouw", "Practicum", "Optimization", "Peephole", "Eephole"};
+/* acron: buffer for the acronym being printed. command: not referenced by the code in this file section. */
+char acron[N*2], command[100];
+/* done[i] != 0 marks word i as already used; pindex[] holds the chosen word indices in order. */
+int  done[N], pindex[N+1];
+/* Return 1 when c equals one of the codes 65, 69, 73, 79, 85, 89
+   (ASCII 'A', 'E', 'I', 'O', 'U', 'Y'); return 0 otherwise. */
+int is_vowel(char c)
+{
+   switch (c) {
+   case 65:   /* 'A' */
+   case 69:   /* 'E' */
+   case 73:   /* 'I' */
+   case 79:   /* 'O' */
+   case 85:   /* 'U' */
+   case 89:   /* 'Y' */
+      return 1;
+   default:
+      return 0;
+   }
+}
+/*
+ * Place word n at position `index` of the current permutation, then either
+ * recurse to place further words or print the finished permutation.
+ *
+ *   n      index into w[] of the word to place
+ *   done   per-word "already used" flags; set and cleared around each recursion
+ *   index  number of words already stored in pindex[]
+ *   size   number of words still to place, including n
+ *
+ * Pruning is based on the first letter of each word (see is_vowel()):
+ *   - when placing the second word, give up if neither of the two words
+ *     starts with a vowel;
+ *   - from the third word on, give up if the last three first letters are
+ *     all vowels or all non-vowels.
+ * A finished permutation is printed as the acronym (the leading upper-case
+ * letters of every chosen word) followed by the chosen words.
+ */
+void do_perm(int n, int done[], int index, int size)
+{
+   int j, i, nrv = 0, k;
+   if (index == 1 && (!is_vowel(w[pindex[0]][0]) && !is_vowel(w[n][0])))
+       return;
+   if (index > 1) {
+      nrv = is_vowel(w[pindex[index-2]][0]) +
+            is_vowel(w[pindex[index-1]][0]) +
+            is_vowel(w[n][0]);
+      if (nrv == 0 || nrv == 3)
+         return;
+   }
+   pindex[index++] = n;
+   if (index < N && --size) {
+      /* More words wanted: try every word that is not in use yet. */
+      for (j = 0; j<N; j++) {
+         if (done[j] == 0) {
+            done[j] = 1;
+            do_perm(j, done, index, size);
+            done[j] = 0;
+         }
+      }
+   } else {
+      k = 0;
+      for (i=0; i < index; i++) {
+         int t = 0;
+         /* <ctype.h> functions require a value representable as unsigned
+            char (or EOF), so convert before the call. */
+         while (isupper((unsigned char)w[pindex[i]][t]))
+            acron[k++] = w[pindex[i]][t++];
+      }
+      acron[k] = 0;
+      printf("%s", acron);
+      for (i=0; i < index; i++)
+         printf(" %s", w[pindex[i]]);
+      printf("\n");
+   }
+}
+/*
+ * Print every accepted word permutation of length 4 up to N: for each
+ * length j, each word in turn is used as the first word and do_perm()
+ * enumerates the rest.  Returns 0.
+ */
+int main(void)
+{
+   int i, j;
+   for (j = 4; j <= N; j++) {
+      for (i = 0; i < N; i++) {
+        done[i] = 1;
+        do_perm(i, done, 0, j);
+        done[i] = 0;
+      }
+   }
+   /* Explicit status: falling off the end of main is only defined from C99 on. */
+   return 0;
+}
--- a/benchmarks/clinpack.c
+++ b/benchmarks/clinpack.c
+/*
+Translated to C by Bonnie Toy 5/88
+You MUST specify one of -DSP   or -DDP     to compile correctly.
+You MUST specify one of -DROLL or -DUNROLL to compile correctly.
+You MUST specify a timer option(see below) to compile correctly.
+To compile double precision version for Sun-4:
+   cc -DUNIX -DDP -DROLL -O4 clinpack.c
+To compile single precision version for Sun-4:
+   cc -DUNIX -DSP -DROLL -O4 -fsingle -fsingle2 clinpack.c
+To obtain   rolled source BLAS, add -DROLL   to the command lines.
+To obtain unrolled source BLAS, add -DUNROLL to the command lines.
+PLEASE NOTE: You can also just 'uncomment' one of the options below.
+*/
+/* #define SP     */
+#define DP    
+/*#define ROLL  */
+#define UNROLL 
+/***************************************************************/
+/* Timer options. You MUST uncomment one of the options below  */
+/* or compile, for example, with the '-DUNIX' option.          */
+/***************************************************************/
+/* #define Amiga       */
+#define UNIX        
+/* #define UNIX_Old    */
+/* #define VMS         */
+/* #define BORLAND_C   */
+/* #define MSC         */
+/* #define MAC         */
+/* #define IPSC        */
+/* #define FORTRAN_SEC */
+/* #define GTODay      */
+/* #define CTimer      */
+/* #define UXPM        */
+#include <stdio.h>
+#include <math.h>
+/* Single precision build: REAL is float; PREC is the label printed by main(). */
+#ifdef SP
+#define REAL float
+#define ZERO 0.0
+#define ONE  1.0
+#define PREC "Single "
+#endif
+/* Double precision build: REAL is double. */
+#ifdef DP
+#define REAL double
+#define ZERO 0.0e0
+#define ONE  1.0e0
+#define PREC "Double "
+#endif
+/* Iteration count of the repeated timing loops in main(); also printed as "Reps". */
+#define NTIMES 1
+/* ROLLING is the label printed by main(); ROLL/UNROLL also select the loop
+   bodies compiled into daxpy, ddot and dscal. */
+#ifdef ROLL
+#define ROLLING "Rolled "
+#endif
+#ifdef UNROLL
+#define ROLLING "Unrolled "
+#endif
+/* Timing table, indexed st[quantity][run]: six quantities (dgefa time,
+   dgesl time, total, kflops, unit, ratio) for each of eight runs.  main()
+   and print_time() use first indices 0..5 and second indices 0..7, so the
+   dimensions must be [6][8]. */
+static double st[6][8];
+/*
+ * Linpack benchmark driver.
+ *
+ * Generates an order-n system with matgen(), factors it with dgefa() and
+ * solves it with dgesl(), prints a residual check, and then records timings
+ * for further factor/solve runs with two leading dimensions (lda = 201 for
+ * array a, ldaa = 200 for array aa).  Results for run r are stored in
+ * st[k][r]: k = 0 dgefa time, 1 dgesl time, 2 total, 3 ops/(1.0e3*total),
+ * 4 2.0e3/st[3][r], 5 total/cray.
+ */
+main ()
+{
+   static REAL aa[200][200],a[200][201],b[200],x[200];
+   REAL cray,ops,total,norma,normx;
+   REAL resid,residn,eps;
+   REAL epslon(),kf;
+   double t1,tm,tm2,dtime();
+   static int ipvt[200],n,i,ntimes,info,lda,ldaa,kflops;
+   lda = 201;
+   ldaa = 200;
+   /* Divisor used for the "ratio" column (st[5][r] = total/cray). */
+   cray = .056; 
+   /* Order of the matrix that is factored and solved. */
+   n = 25;
+   printf(ROLLING); printf(PREC);
+   printf("Precision Linpack\n\n");
+	/* Operation count used to convert times into kflops. */
+	ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
+	/* Run 0: array a with leading dimension lda. */
+	matgen(a,lda,n,b,&norma);
+	t1 = dtime();
+	dgefa(a,lda,n,ipvt,&info);
+	st[0][0] = dtime() - t1;
+	t1 = dtime();
+	dgesl(a,lda,n,ipvt,b,0);
+	st[1][0] = dtime() - t1;
+	total = st[0][0] + st[1][0];
+/*     compute a residual to verify results.  */ 
+	/* Save the solution in x, regenerate the system, and form
+	   b = -b + a*x with dmxpy. */
+	for (i = 0; i < n; i++)
+	   {
+	       x[i] = b[i];
+	   }
+	matgen(a,lda,n,b,&norma);
+	for (i = 0; i < n; i++) 
+	   {
+	       b[i] = -b[i];
+	   }
+	dmxpy(n,b,n,lda,x,a);
+	resid = 0.0;
+	normx = 0.0;
+	/* resid and normx become the largest absolute entries of b and x. */
+	for (i = 0; i < n; i++)
+	 {
+	       resid = (resid > fabs((double)b[i])) 
+	 ? resid : fabs((double)b[i]);
+	       normx = (normx > fabs((double)x[i])) 
+	 ? normx : fabs((double)x[i]);
+	 }
+	eps = epslon((REAL)ONE);
+	residn = resid/( n*norma*normx*eps );
+   printf("   norm. resid      resid           machep");
+   printf("         x[0]-1        x[n-1]-1\n");
+   printf("%8.1f      %16.8e%16.8e%16.8e%16.8e\n",
+	  (double)residn, (double)resid, (double)eps, 
+	       (double)x[0]-1, (double)x[n-1]-1);
+printf(" times are reported for matrices of order %5d\n",n);
+printf("      dgefa      dgesl      total       kflops     unit");
+printf("      ratio\n");
+	st[2][0] = total;
+	st[3][0] = ops/(1.0e3*total);
+	st[4][0] = 2.0e3/st[3][0];
+	st[5][0] = total/cray;
+   printf(" times for array with leading dimension of%5d\n",lda);
+   print_time(0);
+	/* Run 1: same measurement repeated on array a. */
+	matgen(a,lda,n,b,&norma);
+	t1 = dtime();
+	dgefa(a,lda,n,ipvt,&info);
+	st[0][1] = dtime() - t1;
+	t1 = dtime();
+	dgesl(a,lda,n,ipvt,b,0);
+	st[1][1] = dtime() - t1;
+	total = st[0][1] + st[1][1];
+	st[2][1] = total;
+	st[3][1] = ops/(1.0e3*total);
+	st[4][1] = 2.0e3/st[3][1];
+	st[5][1] = total/cray;
+	/* Run 2: same measurement repeated on array a. */
+	matgen(a,lda,n,b,&norma);
+	t1 = dtime();
+	dgefa(a,lda,n,ipvt,&info);
+	st[0][2] = dtime() - t1;
+	t1 = dtime();
+	dgesl(a,lda,n,ipvt,b,0);
+	st[1][2] = dtime() - t1;
+	total = st[0][2] + st[1][2];
+	st[2][2] = total;
+	st[3][2] = ops/(1.0e3*total);
+	st[4][2] = 2.0e3/st[3][2];
+	st[5][2] = total/cray;
+	/* Run 3: average over ntimes iterations; tm2 accumulates the time
+	   spent in matgen so it can be subtracted from the dgefa time. */
+	ntimes = NTIMES;
+	tm2 = 0.0;
+	t1 = dtime();
+   for (i = 0; i < ntimes; i++) {
+	       tm = dtime();
+      matgen(a,lda,n,b,&norma);
+      tm2 = tm2 + dtime() - tm;
+      dgefa(a,lda,n,ipvt,&info);
+      }
+	st[0][3] = (dtime() - t1 - tm2)/ntimes;
+	t1 = dtime();
+   for (i = 0; i < ntimes; i++) {
+	       dgesl(a,lda,n,ipvt,b,0);
+      }
+	st[1][3] = (dtime() - t1)/ntimes;
+	total = st[0][3] + st[1][3];
+	st[2][3] = total;
+	st[3][3] = ops/(1.0e3*total);
+	st[4][3] = 2.0e3/st[3][3];
+	st[5][3] = total/cray;
+   print_time(1);
+   print_time(2);
+   print_time(3);
+	/* Runs 4-6: single measurements on array aa with leading dimension ldaa. */
+	matgen(aa,ldaa,n,b,&norma);
+	t1 = dtime();
+	dgefa(aa,ldaa,n,ipvt,&info);
+	st[0][4] = dtime() - t1;
+	t1 = dtime();
+	dgesl(aa,ldaa,n,ipvt,b,0);
+	st[1][4] = dtime() - t1;
+	total = st[0][4] + st[1][4];
+	st[2][4] = total;
+	st[3][4] = ops/(1.0e3*total);
+	st[4][4] = 2.0e3/st[3][4];
+	st[5][4] = total/cray;
+	matgen(aa,ldaa,n,b,&norma);
+	t1 = dtime();
+	dgefa(aa,ldaa,n,ipvt,&info);
+	st[0][5] = dtime() - t1;
+	t1 = dtime();
+	dgesl(aa,ldaa,n,ipvt,b,0);
+	st[1][5] = dtime() - t1;
+	total = st[0][5] + st[1][5];
+	st[2][5] = total;
+	st[3][5] = ops/(1.0e3*total);
+	st[4][5] = 2.0e3/st[3][5];
+	st[5][5] = total/cray;
+   matgen(aa,ldaa,n,b,&norma);
+   t1 = dtime();
+   dgefa(aa,ldaa,n,ipvt,&info);
+   st[0][6] = dtime() - t1;
+   t1 = dtime();
+   dgesl(aa,ldaa,n,ipvt,b,0);
+   st[1][6] = dtime() - t1;
+   total = st[0][6] + st[1][6];
+   st[2][6] = total;
+   st[3][6] = ops/(1.0e3*total);
+   st[4][6] = 2.0e3/st[3][6];
+   st[5][6] = total/cray;
+   /* Run 7: average over ntimes iterations on aa, matgen time excluded. */
+   ntimes = NTIMES;
+   tm2 = 0;
+   t1 = dtime();
+   for (i = 0; i < ntimes; i++) {
+      tm = dtime();
+      matgen(aa,ldaa,n,b,&norma);
+      tm2 = tm2 + dtime() - tm;
+      dgefa(aa,ldaa,n,ipvt,&info);
+      }
+   st[0][7] = (dtime() - t1 - tm2)/ntimes;
+   t1 = dtime();
+   for (i = 0; i < ntimes; i++) {
+      dgesl(aa,ldaa,n,ipvt,b,0);
+      }
+   st[1][7] = (dtime() - t1)/ntimes;
+   total = st[0][7] + st[1][7];
+   st[2][7] = total;
+   st[3][7] = ops/(1.0e3*total);
+   st[4][7] = 2.0e3/st[3][7];
+   st[5][7] = total/cray;
+   /* the following code sequence implements the semantics of
+      the Fortran intrinsics "nint(min(st[3][3],st[3][7]))"   */
+/*
+   kf = (st[3][3] < st[3][7]) ? st[3][3] : st[3][7];
+   kf = (kf > ZERO) ? (kf + .5) : (kf - .5);
+   if (fabs((double)kf) < ONE) 
+      kflops = 0;
+   else {
+      kflops = floor(fabs((double)kf));
+      if (kf < ZERO) kflops = -kflops;
+   }
+*/
+   /* Active replacement for the block above: clamp both averaged kflops
+      figures at zero, take the smaller one, and round to an int. */
+   if ( st[3][3] < ZERO ) st[3][3] = ZERO;
+   if ( st[3][7] < ZERO ) st[3][7] = ZERO;
+   kf = st[3][3];
+   if ( st[3][7] < st[3][3] ) kf = st[3][7];
+   kflops = (int)(kf + 0.5);
+   printf(" times for array with leading dimension of%4d\n",ldaa);
+   print_time(4);
+   print_time(5);
+   print_time(6);
+   print_time(7);
+   printf(ROLLING); printf(PREC);
+   printf(" Precision %5d Kflops ; %d Reps \n",kflops,NTIMES);
+}
+/*----------------------*/ 
+/* Print one line of the timing table: the six quantities stored in
+   st[0][row] .. st[5][row], in that order. */
+print_time (row)
+int row;
+{
+   double t_dgefa = (double)st[0][row];
+   double t_dgesl = (double)st[1][row];
+   double t_total = (double)st[2][row];
+   double kflops  = (double)st[3][row];
+   double unit    = (double)st[4][row];
+   double ratio   = (double)st[5][row];
+
+   printf("%11.2f%11.2f%11.2f%11.0f%11.2f%11.2f\n",
+          t_dgefa, t_dgesl, t_total, kflops, unit, ratio);
+}
+/*----------------------*/ 
+matgen(a,lda,n,b,norma)
+REAL a[],b[],*norma;
+int lda, n;
+/*
+   Fill the n-by-n system used by the benchmark.  Element [j][i] is stored
+   at a[lda*j+i].
+
+   Every element is taken from the integer sequence
+   seed = 3125*seed % 65536 (starting from 1325) and mapped with
+   (seed - 32768.0)/16384.0.  On return *norma holds the largest generated
+   element (0.0 when none is positive) and b[i] holds the sum over j of
+   a[lda*j+i].
+*/
+{
+   int seed = 1325;
+   int row, col;
+
+   *norma = 0.0;
+   for (col = 0; col < n; col++) {
+      REAL *column = &a[lda*col];
+
+      for (row = 0; row < n; row++) {
+         seed = 3125*seed % 65536;
+         column[row] = (seed - 32768.0)/16384.0;
+         if (column[row] > *norma)
+            *norma = column[row];
+      }
+   }
+
+   for (row = 0; row < n; row++)
+      b[row] = 0.0;
+
+   for (col = 0; col < n; col++) {
+      REAL *column = &a[lda*col];
+
+      for (row = 0; row < n; row++)
+         b[row] = b[row] + column[row];
+   }
+}
+/*----------------------*/ 
+dgefa(a,lda,n,ipvt,info)
+REAL a[];
+int lda,n,ipvt[],*info;
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  
+*/
+/*
+     dgefa factors a double precision matrix by gaussian elimination.
+     dgefa is usually called by dgeco, but it can be called
+     directly with a saving in time if  rcond  is not needed.
+     (time for dgeco) = (1 + 9/n)*(time for dgefa) .
+     on entry
+	a       REAL precision[n][lda]
+		the matrix to be factored.
+	lda     integer
+		the leading dimension of the array  a .
+	n       integer
+		the order of the matrix  a .
+     on return
+	a       an upper triangular matrix and the multipliers
+		which were used to obtain it.
+		the factorization can be written  a = l*u  where
+		l  is a product of permutation and unit lower
+		triangular matrices and  u  is upper triangular.
+	ipvt    integer[n]
+		an integer vector of pivot indices.
+	info    integer
+		= 0  normal value.
+		= k  if  u[k][k] .eq. 0.0 .  this is not an error
+		     condition for this subroutine, but it does
+		     indicate that dgesl or dgedi will divide by zero
+		     if called.  use  rcond  in dgeco for a reliable
+		     indication of singularity.
+		NOTE(review): k is 0-based here, so a zero pivot at k = 0
+		also leaves info at 0, the same as the normal value.
+     linpack. this version dated 08/14/78 .
+     cleve moler, university of new mexico, argonne national lab.
+     functions
+     blas daxpy,dscal,idamax
+*/
+{
+/*     internal variables   */
+REAL t;
+int idamax(),j,k,kp1,l,nm1;
+/*     gaussian elimination with partial pivoting   */
+   *info = 0;
+   nm1 = n - 1;
+   if (nm1 >=  0) {
+      for (k = 0; k < nm1; k++) {
+	 kp1 = k + 1;
+		/* find l = pivot index   */
+	 /* idamax returns an offset relative to &a[lda*k+k], hence the + k */
+	 l = idamax(n-k,&a[lda*k+k],1) + k;
+	 ipvt[k] = l;
+	 /* zero pivot implies this column already 
+	    triangularized */
+	 if (a[lda*k+l] != ZERO) {
+	    /* interchange if necessary */
+	    if (l != k) {
+	       t = a[lda*k+l];
+	       a[lda*k+l] = a[lda*k+k];
+	       a[lda*k+k] = t; 
+	    }
+	    /* compute multipliers */
+	    /* the n-(k+1) entries after the pivot are scaled by -1/pivot */
+	    t = -ONE/a[lda*k+k];
+	    dscal(n-(k+1),t,&a[lda*k+k+1],1);
+	    /* row elimination with column indexing */
+	    for (j = kp1; j < n; j++) {
+	       /* apply the same interchange to vector j, then add t times
+		  the multipliers to its trailing entries */
+	       t = a[lda*j+l];
+	       if (l != k) {
+		  a[lda*j+l] = a[lda*j+k];
+		  a[lda*j+k] = t;
+	       }
+	       daxpy(n-(k+1),t,&a[lda*k+k+1],1,
+		     &a[lda*j+k+1],1);
+	      } 
+	   }
+	 else { 
+		     *info = k;
+	 }
+      } 
+   }
+   /* the last pivot needs no search; only record it and test it for zero */
+   ipvt[n-1] = n-1;
+   if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
+}
+/*----------------------*/ 
+dgesl(a,lda,n,ipvt,b,job)
+int lda,n,ipvt[],job;
+REAL a[],b[];
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  */
+/*
+     dgesl solves the system
+     a * x = b  or  trans(a) * x = b
+     using the factors computed by dgeco or dgefa.
+     on entry
+	a       REAL [n][lda]
+		the output from dgeco or dgefa.
+	lda     integer
+		the leading dimension of the array  a .
+	n       integer
+		the order of the matrix  a .
+	ipvt    integer[n]
+		the pivot vector from dgeco or dgefa.
+	b       REAL [n]
+		the right hand side vector.
+	job     integer
+		= 0         to solve  a*x = b ,
+		= nonzero   to solve  trans(a)*x = b  where
+			    trans(a)  is the transpose.
+     on return
+	b       the solution vector  x .
+     error condition
+	a division by zero will occur if the input factor contains a
+	zero on the diagonal.  technically this indicates singularity
+	but it is often caused by improper arguments or improper
+	setting of lda .  it will not occur if the subroutines are
+	called correctly and if dgeco has set rcond .gt. 0.0
+	or dgefa has set info .eq. 0 .
+     to compute  inverse(a) * c  where  c  is a matrix
+     with  p  columns
+	   dgeco(a,lda,n,ipvt,rcond,z)
+	   if (!rcond is too small){
+	      for (j=0,j<p,j++)
+		    dgesl(a,lda,n,ipvt,c[j][0],0);
+	   }
+     linpack. this version dated 08/14/78 .
+     cleve moler, university of new mexico, argonne national lab.
+     functions
+     blas daxpy,ddot
+*/
+{
+/*     internal variables   */
+   REAL ddot(),t;
+   int k,kb,l,nm1;
+   nm1 = n - 1;
+   if (job == 0) {
+      /* job = 0 , solve  a * x = b
+         first solve  l*y = b       */
+      if (nm1 >= 1) {
+         for (k = 0; k < nm1; k++) {
+            /* apply the recorded interchange, then the multipliers */
+            l = ipvt[k];
+            t = b[l];
+            if (l != k){
+               b[l] = b[k];
+               b[k] = t;
+            }
+            daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1);
+         }
+      }
+      /* now solve  u*x = y */
+      for (kb = 0; kb < n; kb++) {
+          k = n - (kb + 1);
+          b[k] = b[k]/a[lda*k+k];
+          t = -b[k];
+          daxpy(k,t,&a[lda*k+0],1,&b[0],1);
+      }
+   }
+   else {
+      /* job = nonzero, solve  trans(a) * x = b
+         first solve  trans(u)*y = b          */
+      for (k = 0; k < n; k++) {
+         t = ddot(k,&a[lda*k+0],1,&b[0],1);
+         b[k] = (b[k] - t)/a[lda*k+k];
+      }
+      /* now solve trans(l)*x = y   */
+      if (nm1 >= 1) {
+         /* k must run from n-2 down to 0 inclusive, so kb has to reach
+            nm1; stopping at kb < nm1 would leave b[0] unprocessed. */
+         for (kb = 1; kb <= nm1; kb++) {
+            k = n - (kb+1);
+            b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
+            l = ipvt[k];
+            if (l != k) {
+               t = b[l];
+               b[l] = b[k];
+               b[k] = t;
+            }
+         }
+      }
+   }
+}
+/*----------------------*/ 
+daxpy(n,da,dx,incx,dy,incy)
+/*
+     constant times a vector plus a vector:
+     dy[k*incy] = dy[k*incy] + da*dx[k*incx]  for the n elements.
+     jack dongarra, linpack, 3/11/78.
+
+     n      number of elements to process; nothing is done when n <= 0
+     da     scalar multiplier; nothing is done when da is ZERO
+     dx     input vector
+     incx   stride between consecutive elements of dx
+     dy     vector that is updated in place
+     incy   stride between consecutive elements of dy
+*/
+REAL dx[],dy[],da;
+int incx,incy,n;
+{
+   int i,ix,iy,m;
+   if(n <= 0) return;
+   if (da == ZERO) return;
+   if(incx != 1 || incy != 1) {
+      /* code for unequal increments or equal increments
+         not equal to 1.  Indices are 0-based, as in ddot: a positive
+         stride starts at element 0, a negative stride starts at the
+         far end, (-n+1)*inc. */
+      ix = 0;
+      iy = 0;
+      if(incx < 0) ix = (-n+1)*incx;
+      if(incy < 0) iy = (-n+1)*incy;
+      for (i = 0;i < n; i++) {
+         dy[iy] = dy[iy] + da*dx[ix];
+         ix = ix + incx;
+         iy = iy + incy;
+      }
+      return;
+   }
+   /* code for both increments equal to 1 */
+#ifdef ROLL
+   for (i = 0;i < n; i++) {
+      dy[i] = dy[i] + da*dx[i];
+   }
+#endif
+#ifdef UNROLL
+   /* handle the n % 4 leftover elements first, then groups of four */
+   m = n % 4;
+   if ( m != 0) {
+      for (i = 0; i < m; i++)
+         dy[i] = dy[i] + da*dx[i];
+      if (n < 4) return;
+   }
+   for (i = m; i < n; i = i + 4) {
+      dy[i]   = dy[i]   + da*dx[i];
+      dy[i+1] = dy[i+1] + da*dx[i+1];
+      dy[i+2] = dy[i+2] + da*dx[i+2];
+      dy[i+3] = dy[i+3] + da*dx[i+3];
+   }
+#endif
+}
+/*----------------------*/ 
+REAL ddot(n,dx,incx,dy,incy)
+/*
+     dot product of two vectors of n elements, read with strides incx
+     and incy.  returns ZERO when n <= 0.
+     jack dongarra, linpack, 3/11/78.
+*/
+REAL dx[],dy[];
+int incx,incy,n;
+{
+   REAL sum;
+   int cnt,px,py,head;
+   sum = ZERO;
+   if (n <= 0)
+      return(ZERO);
+   if (!(incx == 1 && incy == 1)) {
+      /* general strides: a negative stride starts from the far end */
+      px = (incx < 0) ? (-n+1)*incx : 0;
+      py = (incy < 0) ? (-n+1)*incy : 0;
+      for (cnt = 0; cnt < n; cnt++) {
+         sum = sum + dx[px]*dy[py];
+         px = px + incx;
+         py = py + incy;
+      }
+      return(sum);
+   }
+   /* unit strides */
+#ifdef ROLL
+   for (cnt = 0; cnt < n; cnt++)
+      sum = sum + dx[cnt]*dy[cnt];
+   return(sum);
+#endif
+#ifdef UNROLL
+   /* the first n % 5 elements one at a time, the rest five at a time */
+   head = n % 5;
+   for (cnt = 0; cnt < head; cnt++)
+      sum = sum + dx[cnt]*dy[cnt];
+   for (cnt = head; cnt < n; cnt = cnt + 5) {
+      sum = sum + dx[cnt]*dy[cnt] +
+      dx[cnt+1]*dy[cnt+1] + dx[cnt+2]*dy[cnt+2] +
+      dx[cnt+3]*dy[cnt+3] + dx[cnt+4]*dy[cnt+4];
+   }
+   return(sum);
+#endif
+}
+/*----------------------*/ 
+dscal(n,da,dx,incx)
+/*     multiplies the elements of dx by the constant da, in place.
+      nothing is done when n <= 0.
+      jack dongarra, linpack, 3/11/78.
+*/
+REAL da,dx[];
+int n, incx;
+{
+   int pos,head,limit;
+   if (n <= 0)
+      return;
+   if (incx != 1) {
+      /* strided case: visit indices 0, incx, 2*incx, ... below n*incx */
+      limit = n*incx;
+      pos = 0;
+      while (pos < limit) {
+         dx[pos] = da*dx[pos];
+         pos = pos + incx;
+      }
+      return;
+   }
+   /* unit stride */
+#ifdef ROLL
+   for (pos = 0; pos < n; pos++)
+      dx[pos] = da*dx[pos];
+#endif
+#ifdef UNROLL
+   /* the first n % 5 elements one at a time, the rest five at a time */
+   head = n % 5;
+   for (pos = 0; pos < head; pos++)
+      dx[pos] = da*dx[pos];
+   for (pos = head; pos < n; pos = pos + 5) {
+      dx[pos]   = da*dx[pos];
+      dx[pos+1] = da*dx[pos+1];
+      dx[pos+2] = da*dx[pos+2];
+      dx[pos+3] = da*dx[pos+3];
+      dx[pos+4] = da*dx[pos+4];
+   }
+#endif
+}
+/*----------------------*/ 
+int idamax(n,dx,incx)
+/*
+     finds the index of element having max. absolute value.
+     jack dongarra, linpack, 3/11/78.
+
+     n      number of elements to examine
+     dx     vector to search
+     incx   stride between consecutive elements of dx
+
+     returns -1 when n < 1, otherwise the 0-based position (counted in
+     elements, not in array slots) of the first element with the largest
+     absolute value.
+*/
+REAL dx[];
+int incx,n;
+{
+   REAL dmax;
+   int i, ix, itemp;
+   if( n < 1 ) return(-1);
+   if(n ==1 ) return(0);
+   /* element 0 is the initial candidate in both branches, so itemp must
+      start at 0; otherwise the result is indeterminate whenever the
+      first element is the largest. */
+   itemp = 0;
+   dmax = fabs((double)dx[0]);
+   if(incx != 1) {
+      /* code for increment not equal to 1: element i lives at dx[i*incx]
+         (0-based), so the second element is at offset incx */
+      ix = incx;
+      for (i = 1; i < n; i++) {
+         if(fabs((double)dx[ix]) > dmax)  {
+            itemp = i;
+            dmax = fabs((double)dx[ix]);
+         }
+         ix = ix + incx;
+      }
+   }
+   else {
+      /* code for increment equal to 1 */
+      for (i = 1; i < n; i++) {
+         if(fabs((double)dx[i]) > dmax) {
+            itemp = i;
+            dmax = fabs((double)dx[i]);
+         }
+      }
+   }
+   return (itemp);
+}
+/*----------------------*/ 
+REAL epslon (x)
+REAL x;
+/*
+     estimate unit roundoff in quantities of size x.
+*/
+{
+   REAL four_thirds,third,near_one,eps;
+/*
+     this routine should work on any system where
+	1.  the floating point base is not a power of three, and
+	2.  four_thirds is held to the accuracy of floating point
+	    variables stored in memory.
+     the loop (a statement label and go to in the fortran original) is
+     there to make optimizing compilers honour assumption 2.
+     under these assumptions,
+	    four_thirds  is not exactly equal to four-thirds,
+	    third        has a zero for its last bit or digit,
+	    near_one     is not exactly equal to one,
+	    eps          measures the separation of 1.0 from
+			 the next larger floating point number.
+     the developers of eispack would appreciate being informed
+     about any systems where these assumptions do not hold.
+     *****************************************************************
+     this routine is one of the auxiliary routines used by eispack iii
+     to avoid machine dependencies.
+     *****************************************************************
+     this version dated 4/6/83.
+*/
+   four_thirds = 4.0e0/3.0e0;
+   do {
+      third = four_thirds - ONE;
+      near_one = third + third + third;
+      eps = fabs((double)(near_one-ONE));
+   } while (eps == ZERO);
+   return(eps*fabs((double)x));
+}
+/*----------------------*/ 
+dmxpy (n1, y, n2, ldm, x, m)
+REAL y[], x[], m[];
+int n1, n2, ldm;
+/* We would like to declare m[][ldm], but c does not allow it.  In this
+function, references to m[i][j] are written m[ldm*i+j].  */
+/*
+   purpose:
+     multiply matrix m times vector x and add the result to vector y.
+   parameters:
+     n1 integer, number of elements in vector y, and number of rows in
+	 matrix m
+     y double [n1], vector of length n1 to which is added 
+	 the product m*x
+     n2 integer, number of elements in vector x, and number of columns
+	 in matrix m
+     ldm integer, leading dimension of array m
+     x double [n2], vector of length n2
+     m double [ldm][n2], matrix of n1 rows and n2 columns
+   method:
+     the n2 terms x[j]*m[ldm*j+i] are added to y[i] in groups: first a
+     group of 1, 2, 4 and 8 terms when n2 % 2, n2 % 4, n2 % 8 and
+     n2 % 16 respectively are large enough, then groups of sixteen.
+ ----------------------------------------------------------------------
+*/
+{
+   int j,i,jmin;
+   /* cleanup odd vector */
+   j = n2 % 2;
+   if (j >= 1) {
+      j = j - 1;
+      for (i = 0; i < n1; i++) 
+		  y[i] = (y[i]) + x[j]*m[ldm*j+i];
+   } 
+   /* cleanup odd group of two vectors */
+   j = n2 % 4;
+   if (j >= 2) {
+      j = j - 1;
+      for (i = 0; i < n1; i++)
+		  y[i] = ( (y[i])
+			    + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
+   } 
+   /* cleanup odd group of four vectors */
+   j = n2 % 8;
+   if (j >= 4) {
+      j = j - 1;
+      for (i = 0; i < n1; i++)
+	 y[i] = ((( (y[i])
+		+ x[j-3]*m[ldm*(j-3)+i]) 
+		+ x[j-2]*m[ldm*(j-2)+i])
+		+ x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
+   } 
+   /* cleanup odd group of eight vectors */
+   j = n2 % 16;
+   if (j >= 8) {
+      j = j - 1;
+      for (i = 0; i < n1; i++)
+	 y[i] = ((((((( (y[i])
+		+ x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
+		  + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
+		+ x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
+		+ x[j-1]*m[ldm*(j-1)+i]) + x[j]  *m[ldm*j+i];
+   } 
+   /* main loop - groups of sixteen vectors */
+   /* j is the last index of each group; the first group starts right
+      after the n2 % 16 terms handled above */
+   jmin = (n2%16)+16;
+   for (j = jmin-1; j < n2; j = j + 16) {
+      for (i = 0; i < n1; i++) 
+	 y[i] = ((((((((((((((( (y[i])
+		   + x[j-15]*m[ldm*(j-15)+i]) 
+	    + x[j-14]*m[ldm*(j-14)+i])
+		 + x[j-13]*m[ldm*(j-13)+i]) 
+	    + x[j-12]*m[ldm*(j-12)+i])
+		 + x[j-11]*m[ldm*(j-11)+i]) 
+	    + x[j-10]*m[ldm*(j-10)+i])
+		 + x[j- 9]*m[ldm*(j- 9)+i]) 
+	    + x[j- 8]*m[ldm*(j- 8)+i])
+		 + x[j- 7]*m[ldm*(j- 7)+i]) 
+	    + x[j- 6]*m[ldm*(j- 6)+i])
+		 + x[j- 5]*m[ldm*(j- 5)+i]) 
+	    + x[j- 4]*m[ldm*(j- 4)+i])
+		 + x[j- 3]*m[ldm*(j- 3)+i]) 
+	    + x[j- 2]*m[ldm*(j- 2)+i])
+		 + x[j- 1]*m[ldm*(j- 1)+i]) 
+	    + x[j]   *m[ldm*j+i];
+   }
+} 
+/*****************************************************/
+/* Various timer routines.                           */
+/* Al Aburto, aburto@marlin.nosc.mil, 26 Sep 1992    */
+/*                                                   */
+/* t = dtime() outputs the current time in seconds.  */
+/* Use CAUTION as some of these routines will mess   */
+/* up when timing across the hour mark!!!            */
+/*                                                   */
+/* For timing I use the 'user' time whenever         */
+/* possible. Using 'user+sys' time is a separate     */
+/* issue.                                            */
+/*                                                   */
+/*****************************************************/
+/*********************************/
+/* Timer code.                   */
+/*********************************/
+/*******************/
+/*  Amiga dtime()  */
+/*******************/
+#ifdef Amiga
+#include <ctype.h>
+#define HZ 50
+/* Amiga timer: seconds computed from the minutes and ticks fields that
+   DateStamp() fills in. */
+double dtime()
+{
+   struct   tt {
+      long  days;
+      long  minutes;
+      long  ticks;
+   } stamp;
+   DateStamp(&stamp);
+   return ((double)(stamp.ticks + (stamp.minutes * 60L * 50L))) / (double)HZ;
+}
+#endif
+/*****************************************************/
+/*  UNIX dtime(). This is the preferred UNIX timer.  */
+/*  Provided by: Markku Kolkka, mk59200@cc.tut.fi    */
+/*  HP-UX Addition by: Bo Thide', bt@irfu.se         */
+/*****************************************************/
+#ifdef UNIX
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef __hpux
+#include <sys/syscall.h>
+#define getrusage(a,b) syscall(SYS_getrusage,a,b)
+#endif
+struct rusage rusage;
+/* UNIX timer: user CPU time of this process in seconds, read with
+   getrusage() into the file-level rusage structure. */
+double dtime()
+{
+   double seconds;
+   getrusage(RUSAGE_SELF,&rusage);
+   seconds = (double)(rusage.ru_utime.tv_sec);
+   seconds += (double)(rusage.ru_utime.tv_usec) * 1.0e-06;
+   return seconds;
+}
+#endif
+/***************************************************/
+/*  UNIX_Old dtime(). This is the old UNIX timer.  */
+/*  Use only if absolutely necessary as HZ may be  */
+/*  ill defined on your system.                    */
+/***************************************************/
+#ifdef UNIX_Old
+#include <sys/types.h>
+#include <sys/times.h>
+#include <sys/param.h>
+#ifndef HZ
+#define HZ 60
+#endif
+struct tms tms;
+/* Old UNIX timer: user time reported by times(), divided by HZ. */
+double dtime()
+{
+   times(&tms);
+   return (double)(tms.tms_utime) / (double)HZ;
+}
+#endif
+/*********************************************************/
+/*  VMS dtime() for VMS systems.                         */
+/*  Provided by: RAMO@uvphys.phys.UVic.CA                */
+/*  Some people have run into problems with this timer.  */
+/*********************************************************/
+#ifdef VMS
+#include time
+#ifndef HZ
+#define HZ 100
+#endif
+struct tbuffer_t
+       {
+	int proc_user_time;
+	int proc_system_time;
+	int child_user_time;
+	int child_system_time;
+       };
+struct tbuffer_t tms;
+/* VMS timer: process user time reported by times(), divided by HZ. */
+double dtime()
+{
+   times(&tms);
+   return (double)(tms.proc_user_time) / (double)HZ;
+}
+#endif
+/******************************/
+/*  BORLAND C dtime() for DOS */
+/******************************/
+#ifdef BORLAND_C
+#include <ctype.h>
+#include <dos.h>
+#include <time.h>
+#define HZ 100
+struct time tnow;
+/* Borland C timer: minutes, seconds and hundredths from gettime(),
+   combined into seconds.  The hour is not included. */
+double dtime()
+{
+   double seconds;
+   gettime(&tnow);
+   seconds = 60.0 * (double)(tnow.ti_min);
+   seconds += (double)(tnow.ti_sec);
+   seconds += (double)(tnow.ti_hund)/(double)HZ;
+   return seconds;
+}
+#endif
+/**************************************/
+/*  Microsoft C (MSC) dtime() for DOS */
+/**************************************/
+#ifdef MSC
+#include <time.h>
+#include <ctype.h>
+#define HZ CLK_TCK
+clock_t tnow;
+/* Microsoft C timer: clock() ticks, stored in tnow, divided by HZ. */
+double dtime()
+{
+   tnow = clock();
+   return (double)tnow / (double)HZ;
+}
+#endif
+/*************************************/
+/*  Macintosh (MAC) Think C dtime()  */
+/*************************************/
+#ifdef MAC
+#include <time.h>
+#define HZ 60
+/* Think C timer: clock() ticks divided by HZ. */
+double dtime()
+{
+   return (double)clock() / (double)HZ;
+}
+#endif
+/************************************************************/
+/*  iPSC/860 (IPSC) dtime() for i860.                       */
+/*  Provided by: Dan Yergeau, yergeau@gloworm.Stanford.EDU  */
+/************************************************************/
+#ifdef IPSC
+extern double dclock();
+/* iPSC/860 timer: forwards the value returned by dclock(). */
+double dtime()
+{
+   return dclock();
+}
+#endif
+/**************************************************/
+/*  FORTRAN dtime() for Cray type systems.        */
+/*  This is the preferred timer for Cray systems. */
+/**************************************************/
+#ifdef FORTRAN_SEC
+fortran double second();
+/* Cray timer: returns the value that the fortran routine second()
+   stores through its argument. */
+double dtime()
+{
+   double seconds;
+   second(&seconds);
+   return seconds;
+}
+#endif
+/***********************************************************/
+/*  UNICOS C dtime() for Cray UNICOS systems.  Don't use   */
+/*  unless absolutely necessary as returned time includes  */
+/*  'user+system' time.  Provided by: R. Mike Dority,      */
+/*  dority@craysea.cray.com                                */
+/***********************************************************/
+#ifdef CTimer
+#include <time.h>
+/* clock()-based timer: clock() ticks divided by CLOCKS_PER_SEC. */
+double dtime()
+{
+   clock_t   ticks = clock();
+   return (double)ticks / (double)CLOCKS_PER_SEC;
+}
+#endif
+/********************************************/
+/* Another UNIX timer using gettimeofday(). */
+/* However, getrusage() is preferred.       */
+/********************************************/
+#ifdef GTODay
+#include <sys/time.h>
+struct timeval tnow;
+/* gettimeofday() timer: seconds plus microseconds of the time stored in
+   the file-level tnow. */
+double dtime()
+{
+   gettimeofday(&tnow,NULL);
+   return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
+}
+#endif
+/*****************************************************/
+/*  Fujitsu UXP/M timer.                             */
+/*  Provided by: Mathew Lim, ANUSF, M.Lim@anu.edu.au */
+/*****************************************************/
+#ifdef UXPM
+#include <sys/types.h>
+#include <sys/timesu.h>
+struct tmsu rusage;
+/* Fujitsu UXP/M timer: tms_utime from timesu(), scaled by 1.0e-06. */
+double dtime()
+{
+   timesu(&rusage);
+   return (double)(rusage.tms_utime) * 1.0e-06;
+}
+#endif
--- a/benchmarks/dhrystone.c
+++ b/benchmarks/dhrystone.c
+/*	EVERYBODY:	Please read "APOLOGY" below. -rick 01/06/86
+ *
+ *	"DHRYSTONE" Benchmark Program
+ *
+ *	Version:	C/1.1, 12/01/84
+ *
+ *	Date:		PROGRAM updated 01/06/86, RESULTS updated 02/17/86
+ *
+ *	Author:		Reinhold P. Weicker,  CACM Vol 27, No 10, 10/84 pg. 1013
+ *			Translated from ADA by Rick Richardson
+ *			Every method to preserve ADA-likeness has been used,
+ *			at the expense of C-ness.
+ *
+ *	Compile:	cc -O dry.c -o drynr			: No registers
+ *			cc -O -DREG=register dry.c -o dryr	: Registers
+ *
+ *	Defines:	Defines are provided for old C compilers
+ *			which don't have enums, and can't assign structures.
+ *			The time(2) function is library dependent; most
+ *			return the time in seconds, but beware of some, like
+ *			Aztec C, which return other units.
+ *			The LOOPS define is initially set for 50000 loops.
+ *			If you have a machine with large integers and is
+ *			very fast, please change this number to 500000 to
+ *			get better accuracy.  Please select the way to
+ *			measure the execution time using the TIME define.
+ *			For single user machines, time(2) is adequate. For
+ *			multi-user machines where you cannot get single-user
+ *			access, use the times(2) function.  If you have
+ *			neither, use a stopwatch in the dead of night.
+ *			Use a "printf" at the point marked "start timer"
+ *			to begin your timings. DO NOT use the UNIX "time(1)"
+ *			command, as this will measure the total time to
+ *			run this program, which will (erroneously) include
+ *			the time to malloc(3) storage and to compute the
+ *			time it takes to do nothing.
+ *
+ *	Run:		drynr; dryr
+ *
+ *	Results:	If you get any new machine/OS results, please send to:
+ *
+ *				{ihnp4,vax135,..}!houxm!castor!pcrat!rick
+ *
+ *			and thanks to all that do.  Space prevents listing
+ *			the names of those who have provided some of these
+ *			results.  I'll be forwarding these results to
+ *			Reinhold Weicker.
+ *
+ *	Note:		I order the list in increasing performance of the
+ *			"with registers" benchmark.  If the compiler doesn't
+ *			provide register variables, then the benchmark
+ *			is the same for both REG and NOREG.
+ *
+ *	PLEASE:		Send complete information about the machine type,
+ *			clock speed, OS and C manufacturer/version.  If
+ *			the machine is modified, tell me what was done.
+ *			On UNIX, execute uname -a and cc -V to get this info.
+ *
+ *	80x8x NOTE:	80x8x benchers: please try to do all memory models
+ *			for a particular compiler.
+ *
+ *	APOLOGY (1/30/86):
+ *		Well, I goofed things up!  As pointed out by Haakon Bugge,
+ *		the line of code marked "GOOF" below was missing from the
+ *		Dhrystone distribution for the last several months.  It
+ *		*WAS* in a backup copy I made last winter, so no doubt it
+ *		was victimized by sleepy fingers operating vi!
+ *
+ *		The effect of the line missing is that the reported benchmarks
+ *		are 15% too fast (at least on a 80286).  Now, this creates
+ *		a dilemma - do I throw out ALL the data so far collected
+ *		and use only results from this (corrected) version, or
+ *		do I just keep collecting data for the old version?
+ *
+ *		Since the data collected so far *is* valid as long as it
+ *		is compared with like data, I have decided to keep
+ *		TWO lists- one for the old benchmark, and one for the
+ *		new.  This also gives me an opportunity to correct one
+ *		other error I made in the instructions for this benchmark.
+ *		My experience with C compilers has been mostly with
+ *		UNIX 'pcc' derived compilers, where the 'optimizer' simply
+ *		fixes sloppy code generation (peephole optimization).
+ *		But today, there exist C compiler optimizers that will actually
+ *		perform optimization in the Computer Science sense of the word,
+ *		by removing, for example, assignments to a variable whose
+ *		value is never used.  Dhrystone, unfortunately, provides
+ *		lots of opportunities for this sort of optimization.
+ *
+ *		I request that benchmarkers re-run this new, corrected
+ *		version of Dhrystone, turning off or bypassing optimizers
+ *		which perform more than peephole optimization.  Please
+ *		indicate the version of Dhrystone used when reporting the
+ *		results to me.
+ *		
+ * RESULTS BEGIN HERE
+ *
+ *----------------DHRYSTONE VERSION 1.1 RESULTS BEGIN--------------------------
+ *
+ * MACHINE	MICROPROCESSOR	OPERATING	COMPILER	DHRYSTONES/SEC.
+ * TYPE				SYSTEM				NO REG	REGS
+ * --------------------------	------------	-----------	---------------
+ * IBM PC/AT    80286-7.5Mhz    Venix/286 SVR2  cc              1159    1254 *15
+ *
+ *
+ *----------------DHRYSTONE VERSION 1.0 RESULTS BEGIN--------------------------
+ *
+ * MACHINE	MICROPROCESSOR	OPERATING	COMPILER	DHRYSTONES/SEC.
+ * TYPE				SYSTEM				NO REG	REGS
+ * --------------------------	------------	-----------	---------------
+ * Commodore 64	6510-1MHz	C64 ROM		C Power 2.8	  36	  36
+ * HP-110	8086-5.33Mhz	MSDOS 2.11	Lattice 2.14	 284	 284
+ * IBM PC/XT	8088-4.77Mhz	PC/IX		cc		 271	 294
+ * CCC 3205	?		Xelos(SVR2) 	cc		 279	 296
+ * Perq-II	2901 bitslice	Accent S5c 	cc (CMU)	 301	 301
+ * IBM PC/XT	8088-4.77Mhz	COHERENT 2.3.43	MarkWilliams cc  296	 317
+ * Cosmos	68000-8Mhz	UniSoft		cc		 305	 322
+ * IBM PC/XT	8088-4.77Mhz	Venix/86 2.0	cc		 297	 324
+ * DEC PRO 350  11/23           Venix/PRO SVR2  cc               299     325
+ * IBM PC	8088-4.77Mhz	MSDOS 2.0	b16cc 2.0	 310	 340
+ * PDP11/23	11/23           Venix (V7)      cc               320     358
+ * Commodore Amiga		?		Lattice 3.02	 368	 371
+ * PC/XT        8088-4.77Mhz    Venix/86 SYS V  cc               339     377
+ * IBM PC	8088-4.77Mhz	MSDOS 2.0	CI-C86 2.20M	 390	 390
+ * IBM PC/XT	8088-4.77Mhz	PCDOS 2.1	Wizard 2.1	 367	 403
+ * IBM PC/XT	8088-4.77Mhz	PCDOS 3.1	Lattice 2.15	 403	 403 @
+ * Colex DM-6	68010-8Mhz	Unisoft SYSV	cc		 378	 410
+ * IBM PC	8088-4.77Mhz	PCDOS 3.1	Datalight 1.10	 416	 416
+ * IBM PC	NEC V20-4.77Mhz	MSDOS 3.1	MS 3.1 		 387	 420
+ * IBM PC/XT	8088-4.77Mhz	PCDOS 2.1	Microsoft 3.0	 390	 427
+ * IBM PC	NEC V20-4.77Mhz	MSDOS 3.1	MS 3.1 (186) 	 393	 427
+ * PDP-11/34	-		UNIX V7M	cc		 387	 438
+ * IBM PC	8088, 4.77mhz	PC-DOS 2.1	Aztec C v3.2d	 423	 454
+ * Tandy 1000	V20, 4.77mhz	MS-DOS 2.11	Aztec C v3.2d	 423	 458
+ * Tandy TRS-16B 68000-6Mhz	Xenix 1.3.5	cc		 438	 458
+ * PDP-11/34	-		RSTS/E		decus c		 438	 495
+ * Onyx C8002	Z8000-4Mhz	IS/1 1.1 (V7)	cc		 476	 511
+ * CCC 3230			Xelos (SysV.2)	cc		 507	 565
+ * Tandy TRS-16B 68000-6Mhz	Xenix 1.3.5	Green Hills	 609	 617
+ * DEC PRO 380  11/73           Venix/PRO SVR2  cc               577     628
+ * FHL QT+	68000-10Mhz	Os9/68000	version 1.3	 603	 649 FH
+ * Apollo DN550	68010-?Mhz	AegisSR9/IX	cc 3.12		 666	 666
+ * HP-110	8086-5.33Mhz	MSDOS 2.11	Aztec-C		 641	 676 
+ * ATT PC6300	8086-8Mhz	MSDOS 2.11	b16cc 2.0	 632	 684
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.0	CI-C86 2.1	 666	 684
+ * Tandy 6000	68000-8Mhz	Xenix 3.0	cc		 694	 694
+ * IBM PC/AT	80286-6Mhz	Xenix 3.0	cc		 684	 704 MM
+ * Macintosh	68000-7.8Mhz 2M	Mac Rom		Mac C 32 bit int 694	 704
+ * Macintosh	68000-7.7Mhz	-		MegaMax C 2.0	 661	 709
+ * IBM PC/AT	80286-6Mhz	Xenix 3.0	cc		 704	 714 LM
+ * Codata 3300	68000-8Mhz	UniPlus+ (v7)	cc		 678	 725
+ * WICAT MB	68000-8Mhz	System V	WICAT C 4.1	 585	 731 ~
+ * Cadmus 9000	68010-10Mhz	UNIX		cc		 714	 735
+ * AT&T 6300    8086-8Mhz       Venix/86 SVR2   cc               668     743
+ * Cadmus 9790	68010-10Mhz 1MB	SVR0,Cadmus3.7	cc		 720	 747
+ * NEC PC9801F	8086-8Mhz	PCDOS 2.11	Lattice 2.15	 768	  -  @
+ * ATT PC6300	8086-8Mhz	MSDOS 2.11	CI-C86 2.20M	 769	 769
+ * Burroughs XE550 68010-10Mhz	Centix 2.10	cc		 769	 769 CT1
+ * EAGLE/TURBO  8086-8Mhz       Venix/86 SVR2   cc               696     779
+ * ALTOS 586	8086-10Mhz	Xenix 3.0b	cc 		 724	 793
+ * DEC 11/73	J-11 micro	Ultrix-11 V3.0	System V	 735	 793
+ * ATT 3B2/300	WE32000-?Mhz	UNIX 5.0.2	cc		 735	 806
+ * Apollo DN320	68010-?Mhz	AegisSR9/IX	cc 3.12		 806	 806
+ * IRIS-2400	68010-10Mhz	UNIX System V	cc		 772	 829
+ * Atari 520ST  68000-8Mhz      TOS             DigResearch      839     846
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.0	MS 3.0(large)	 833	 847 LM
+ * WICAT MB	68000-8Mhz	System V	WICAT C 4.1	 675	 853 S~
+ * VAX 11/750	-		Ultrix 1.1	4.2BSD cc	 781	 862
+ * CCC  7350A	68000-8MHz	UniSoft V.2	cc		 821	 875
+ * VAX 11/750	-		UNIX 4.2bsd	cc		 862	 877
+ * Fast Mac	68000-7.7Mhz	-		MegaMax C 2.0	 839	 904 +
+ * IBM PC/XT	8086-9.54Mhz	PCDOS 3.1	Microsoft 3.0	 833	 909 C1
+ * DEC 11/44			Ultrix-11 V3.0	System V	 862	 909
+ * Macintosh	68000-7.8Mhz 2M	Mac Rom		Mac C 16 bit int 877	 909 S
+ * CCC 3210	?		Xelos R01(SVR2)	cc		 849	 924
+ * CCC 3220	?               Ed. 7 v2.3      cc		 892	 925
+ * IBM PC/AT	80286-6Mhz	Xenix 3.0	cc -i		 909	 925
+ * AT&T 6300	8086, 8mhz	MS-DOS 2.11	Aztec C v3.2d	 862	 943
+ * IBM PC/AT	80286-6Mhz	Xenix 3.0	cc		 892	 961
+ * VAX 11/750	w/FPA		Eunice 3.2	cc		 914	 976
+ * IBM PC/XT	8086-9.54Mhz	PCDOS 3.1	Wizard 2.1	 892	 980 C1
+ * IBM PC/XT	8086-9.54Mhz	PCDOS 3.1	Lattice 2.15	 980	 980 C1
+ * Plexus P35	68000-10Mhz	UNIX System III cc		 984	 980
+ * PDP-11/73	KDJ11-AA 15Mhz	UNIX V7M 2.1	cc		 862     981
+ * VAX 11/750	w/FPA		UNIX 4.3bsd	cc		 994	 997
+ * IRIS-1400	68010-10Mhz	UNIX System V	cc		 909	1000
+ * IBM PC/AT	80286-6Mhz	Venix/86 2.1	cc		 961	1000
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.0	b16cc 2.0	 943	1063
+ * Zilog S8000/11 Z8001-5.5Mhz	Zeus 3.2	cc		1011	1084
+ * NSC ICM-3216 NSC 32016-10Mhz	UNIX SVR2	cc		1041	1084
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.0	MS 3.0(small)	1063	1086
+ * VAX 11/750	w/FPA		VMS		VAX-11 C 2.0	 958	1091
+ * Stride	68000-10Mhz	System-V/68	cc		1041	1111
+ * Plexus P/60  MC68000-12.5Mhz	UNIX SYSIII	Plexus		1111	1111
+ * ATT PC7300	68010-10Mhz	UNIX 5.2	cc		1041	1111
+ * CCC 3230	?		Xelos R01(SVR2)	cc		1040	1126
+ * Stride	68000-12Mhz	System-V/68	cc		1063	1136
+ * IBM PC/AT    80286-6Mhz      Venix/286 SVR2  cc              1056    1149
+ * Plexus P/60  MC68000-12.5Mhz	UNIX SYSIII	Plexus		1111	1163 T
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.0	Datalight 1.10	1190	1190
+ * ATT PC6300+	80286-6Mhz	MSDOS 3.1	b16cc 2.0	1111	1219
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.1	Wizard 2.1	1136	1219
+ * Sun2/120	68010-10Mhz	Sun 4.2BSD	cc		1136	1219
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.0	CI-C86 2.20M	1219	1219
+ * WICAT PB	68000-8Mhz	System V	WICAT C 4.1	 998	1226 ~
+ * MASSCOMP 500	68010-10MHz	RTU V3.0	cc (V3.2)	1156	1238
+ * Alliant FX/8 IP (68012-12Mhz) Concentrix	cc -ip;exec -i 	1170	1243 FX
+ * Cyb DataMate	68010-12.5Mhz	Uniplus 5.0	Unisoft cc	1162	1250
+ * PDP 11/70	-		UNIX 5.2	cc		1162	1250
+ * IBM PC/AT	80286-6Mhz	PCDOS 3.1	Lattice 2.15	1250	1250
+ * IBM PC/AT	80286-7.5Mhz	Venix/86 2.1	cc		1190	1315 *15
+ * Sun2/120	68010-10Mhz	Standalone	cc		1219	1315
+ * Intel 380	80286-8Mhz	Xenix R3.0up1	cc		1250	1315 *16
+ * Sequent Balance 8000	NS32032-10MHz	Dynix 2.0	cc	1250	1315 N12
+ * IBM PC/DSI-32 32032-10Mhz	MSDOS 3.1	GreenHills 2.14	1282	1315 C3
+ * ATT 3B2/400	WE32100-?Mhz	UNIX 5.2	cc		1315	1315
+ * CCC 3250XP	-		Xelos R01(SVR2)	cc		1215	1318
+ * IBM PC/RT 032 RISC(801?)?Mhz BSD 4.2         cc              1248    1333 RT
+ * DG MV4000	-		AOS/VS 5.00	cc		1333	1333
+ * IBM PC/AT	80286-8Mhz	Venix/86 2.1	cc		1275	1380 *16
+ * IBM PC/AT	80286-6Mhz	MSDOS 3.0	Microsoft 3.0	1250	1388
+ * ATT PC6300+	80286-6Mhz	MSDOS 3.1	CI-C86 2.20M	1428	1428
+ * COMPAQ/286   80286-8Mhz      Venix/286 SVR2  cc              1326    1443
+ * IBM PC/AT    80286-7.5Mhz    Venix/286 SVR2  cc              1333    1449 *15
+ * WICAT PB	68000-8Mhz	System V	WICAT C 4.1	1169	1464 S~
+ * Tandy II/6000 68000-8Mhz	Xenix 3.0	cc      	1384	1477
+ * WICAT MB	68000-12.5Mhz	System V	WICAT C 4.1	1246	1537 ~
+ * IBM PC/AT    80286-9Mhz      SCO Xenix V     cc              1540    1556 *18
+ * Cyb DataMate	68010-12.5Mhz	Uniplus 5.0	Unisoft cc	1470	1562 S
+ * VAX 11/780	-		UNIX 5.2	cc		1515	1562
+ * MicroVAX-II	-		-		-		1562	1612
+ * VAX 11/780	-		UNIX 4.3bsd	cc		1646	1662
+ * Apollo DN660	-		AegisSR9/IX	cc 3.12		1666	1666
+ * ATT 3B20	-		UNIX 5.2	cc		1515	1724
+ * NEC PC-98XA	80286-8Mhz	PCDOS 3.1	Lattice 2.15	1724	1724 @
+ * HP9000-500	B series CPU	HP-UX 4.02	cc		1724	-
+ * IBM PC/STD	80286-8Mhz	MSDOS 3.0 	Microsoft 3.0	1724	1785 C2
+ * WICAT MB	68000-12.5Mhz	System V	WICAT C 4.1	1450	1814 S~
+ * WICAT PB	68000-12.5Mhz	System V	WICAT C 4.1	1530	1898 ~
+ * DEC-2065	KL10-Model B	TOPS-20 6.1FT5	Port. C Comp.	1937	1946
+ * Gould PN6005	-		UTX 1.1(4.2BSD)	cc		1675	1964
+ * DEC2060	KL-10		TOPS-20		cc		2000	2000 &
+ * VAX 11/785	-		UNIX 5.2	cc		2083	2083
+ * VAX 11/785	-		VMS		VAX-11 C 2.0	2083	2083
+ * VAX 11/785	-		UNIX SVR2	cc		2123	2083
+ * VAX 11/785   -               ULTRIX-32 1.1   cc		2083    2091 
+ * VAX 11/785	-		UNIX 4.3bsd	cc		2135	2136
+ * WICAT PB	68000-12.5Mhz	System V	WICAT C 4.1	1780	2233 S~
+ * Pyramid 90x	-		OSx 2.3		cc		2272	2272
+ * Pyramid 90x	FPA,cache,4Mb	OSx 2.5		cc no -O	2777	2777
+ * Pyramid 90x	w/cache		OSx 2.5		cc w/-O		3333	3333
+ * IBM-4341-II	-		VM/SP3		Waterloo C 1.2  3333	3333
+ * IRIS-2400T	68020-16.67Mhz	UNIX System V	cc		3105	3401
+ * Celerity C-1200 ?		UNIX 4.2BSD	cc		3485	3468
+ * SUN 3/75	68020-16.67Mhz	SUN 4.2 V3	cc		3333	3571
+ * IBM-4341	Model 12	UTS 5.0		?		3685	3685
+ * SUN-3/160    68020-16.67Mhz  Sun 4.2 V3.0A   cc		3381    3764
+ * Sun 3/180	68020-16.67Mhz	Sun 4.2		cc		3333	3846
+ * IBM-4341	Model 12	UTS 5.0		?		3910	3910 MN
+ * MC 5400	68020-16.67MHz	RTU V3.0	cc (V4.0)	3952	4054
+ * NCR Tower32  68020-16.67Mhz  SYS 5.0 Rel 2.0 cc              3846	4545
+ * Gould PN9080	-		UTX-32 1.1c	cc		-	4629
+ * MC 5600/5700	68020-16.67MHz	RTU V3.0	cc (V4.0)	4504	4746 %
+ * Gould 1460-342 ECL proc      UTX/32 1.1/c    cc              5342    5677 G1
+ * VAX 8600	-		UNIX 4.3bsd	cc		7024	7088
+ * VAX 8600	-		VMS		VAX-11 C 2.0	7142	7142
+ * Alliant FX/8 CE		Concentrix	cc -ce;exec -c 	6952	7655 FX
+ * CCI POWER 6/32		COS(SV+4.2)	cc		7500	7800
+ * CCI POWER 6/32		POWER 6 UNIX/V	cc		8236	8498
+ * CCI POWER 6/32		4.2 Rel. 1.2b	cc		8963	9544
+ * Sperry (CCI Power 6)		4.2BSD		cc		9345   10000
+ * CRAY-X-MP/12	   105Mhz	COS 1.14	Cray C         10204   10204
+ * IBM-3083	-		UTS 5.0 Rel 1	cc	       16666   12500
+ * CRAY-1A	    80Mhz	CTSS		Cray C 2.0     12100   13888
+ * IBM-3083	-		VM/CMS HPO 3.4	Waterloo C 1.2 13889   13889
+ * Amdahl 470 V/8 		UTS/V 5.2       cc v1.23       15560   15560
+ * CRAY-X-MP/48	   105Mhz	CTSS		Cray C 2.0     15625   17857
+ * Amdahl 580	-		UTS 5.0 Rel 1.2	cc v1.5        23076   23076
+ * Amdahl 5860	 		UTS/V 5.2       cc v1.23       28970   28970
+ *
+ * NOTE
+ *   *   Crystal changed from 'stock' to listed value.
+ *   +   This Macintosh was upgraded from 128K to 512K in such a way that
+ *       the new 384K of memory is not slowed down by video generator accesses.
+ *   %   Single processor; MC == MASSCOMP
+ *   &   A version 7 C compiler written at New Mexico Tech.
+ *   @   vanilla Lattice compiler used with MicroPro standard library
+ *   S   Shorts used instead of ints
+ *   T	 with Chris Torek's patches (whatever they are).
+ *   ~   For WICAT Systems: MB=MultiBus, PB=Proprietary Bus
+ *   LM  Large Memory Model. (Otherwise, all 80x8x results are small model)
+ *   MM  Medium Memory Model. (Otherwise, all 80x8x results are small model)
+ *   C1  Univation PC TURBO Co-processor; 9.54Mhz 8086, 640K RAM
+ *   C2  Seattle Telecom STD-286 board
+ *   C3  Definicon DSI-32 coprocessor
+ *   C?  Unknown co-processor board?
+ *   CT1 Convergent Technologies MegaFrame, 1 processor.
+ *   MN  Using Mike Newtons 'optimizer' (see net.sources).
+ *   G1  This Gould machine has 2 processors and was able to run 2 dhrystone
+ *       Benchmarks in parallel with no slowdown.
+ *   FH  FHC == Frank Hogg Labs (Hazelwood Uniquad 2 in an FHL box).
+ *   FX  The Alliant FX/8 is a system consisting of 1-8 CEs (computation
+ *	 engines) and 1-12 IPs (interactive processors). Note N8 applies.
+ *   RT  This is one of the RT's that CMU has been using for awhile.  I'm
+ *	 not sure that this is identical to the machine that IBM is selling
+ *	 to the public.
+ *   Nnn This machine has multiple processors, allowing "nn" copies of the
+ *	 benchmark to run in the same time as 1 copy.
+ *   ?   I don't trust results marked with '?'.  These were sent to me with
+ *       either incomplete info, or with times that just don't make sense.
+ *	 ?? means I think the performance is too poor, ?! means too good.
+ *       If anybody can confirm these figures, please respond.
+ *
+ *  ABBREVIATIONS
+ *	CCC	Concurrent Computer Corp. (was Perkin-Elmer)
+ *	MC	Masscomp
+ *
+ *--------------------------------RESULTS END----------------------------------
+ *
+ *	The following program contains statements of a high-level programming
+ *	language (C) in a distribution considered representative:
+ *
+ *	assignments			53%
+ *	control statements		32%
+ *	procedure, function calls	15%
+ *
+ *	100 statements are dynamically executed.  The program is balanced with
+ *	respect to the three aspects:
+ *		- statement type
+ *		- operand type (for simple data types)
+ *		- operand access
+ *			operand global, local, parameter, or constant.
+ *
+ *	The combination of these three aspects is balanced only approximately.
+ *
+ *	The program does not compute anything meaningful, but it is
+ *	syntactically and semantically correct.
+ *
+ */
+/* Accuracy of timings and human fatigue controlled by next two lines */
+/*#define LOOPS	50000		/* Use this for slow or 16 bit machines */
+/*
+ * NOTE(review): the active value below is ten times smaller than the
+ * alternative above, although the file header recommends raising LOOPS to
+ * 500000 on fast machines.  Presumably the count was lowered to keep runs
+ * short; with HZ ticks per second such a short run can measure zero ticks,
+ * so confirm the value before relying on the reported rate.
+ */
+#define LOOPS	5000		/* Pass count used by this build */
+/* Compiler dependent options */
+#undef	NOENUM			/* Define if compiler has no enum's */
+#undef	NOSTRUCTASSIGN		/* Define if compiler can't assign structures */
+/* define only one of the next two defines */
+#define TIMES			/* Use times(2) time function */
+/*#define TIME			/* Use time(2) time function */
+/* define the granularity of your times(2) function (when used) */
+/*#define HZ	60		/* times(2) returns 1/60 second (most) */
+#define HZ	100		/* times(2) returns 1/100 second (WECo) */
+/* for compatibility with goofed up version */
+/*#define GOOF			/* Define if you want the goofed up version */
+/* Version string printed in the result line; "1.0" only when GOOF is set. */
+#ifdef GOOF
+char	Version[] = "1.0";
+#else
+char	Version[] = "1.1";
+#endif
+/*
+ * structassign(d, s): copy structure s into d.  Expands to a plain
+ * assignment unless NOSTRUCTASSIGN is defined, in which case it calls
+ * memcpy on the addresses of both operands.  The assignment form is not
+ * parenthesised, so use it only as a full statement.
+ */
+#ifdef	NOSTRUCTASSIGN
+#define	structassign(d, s)	memcpy(&(d), &(s), sizeof(d))
+#else
+#define	structassign(d, s)	d = s
+#endif
+/*
+ * Enumeration: the five identifiers Ident1..Ident5.  With NOENUM they are
+ * the integer macros 1..5 and Enumeration is int; otherwise a real enum
+ * (whose values therefore start at 0).
+ */
+#ifdef	NOENUM
+#define	Ident1	1
+#define	Ident2	2
+#define	Ident3	3
+#define	Ident4	4
+#define	Ident5	5
+typedef int	Enumeration;
+#else
+typedef enum	{Ident1, Ident2, Ident3, Ident4, Ident5} Enumeration;
+#endif
+/* Range-named aliases; C does not enforce the ranges the names suggest. */
+typedef int	OneToThirty;
+typedef int	OneToFifty;
+typedef char	CapitalLetter;
+/* 31 bytes: room for the 30-character benchmark strings plus the NUL. */
+typedef char	String30[31];
+/* Valid indices are 0..50 in each dimension. */
+typedef int	Array1Dim[51];
+typedef int	Array2Dim[51][51];
+/* Record type linked through PtrComp; two instances are allocated in Proc0. */
+struct	Record
+{
+	struct Record		*PtrComp;
+	Enumeration		Discr;
+	Enumeration		EnumComp;
+	OneToFifty		IntComp;
+	String30		StringComp;
+};
+typedef struct Record 	RecordType;
+typedef RecordType *	RecordPtr;
+typedef int		boolean;
+/* Local definitions; NOTE(review): would clash if a standard header that
+ * defines NULL were included before this point -- confirm none is. */
+#define	NULL		0
+#define	TRUE		1
+#define	FALSE		0
+/*
+ * REG qualifies the locals meant for registers.  It expands to nothing
+ * unless the build supplies its own definition (presumably -DREG=register
+ * for the "with registers" figures -- confirm against the build rules).
+ */
+#ifndef REG
+#define	REG
+#endif
+/* Old-style declarations: return types only, arguments are not checked. */
+extern Enumeration	Func1();
+extern boolean		Func2();
+#ifdef TIMES
+#include <sys/types.h>
+#include <sys/times.h>
+#endif
+/*
+ * Program entry point: runs the benchmark driver Proc0 once and terminates
+ * with status 0 through exit().  Written in pre-ANSI style: the return type
+ * is implicit int and Proc0 is called before its definition.
+ */
+main()
+{
+	Proc0();
+	exit(0);
+}
+/*
+ * Package 1
+ *
+ * Global state shared by the benchmark procedures.
+ */
+int		IntGlob;	/* written by Proc3 (100) and Proc8 (5); read by Proc2, Proc3, Proc6 */
+boolean		BoolGlob;	/* written by Proc0 and Proc5; read by Proc4 */
+char		Char1Glob;	/* set to 'A' by Proc5; tested by Proc2 and Proc4 */
+char		Char2Glob;	/* set to 'B' by Proc4; upper bound of the character loop in Proc0 */
+Array1Dim	Array1Glob;	/* passed to Proc8 by Proc0 */
+Array2Dim	Array2Glob;	/* passed to Proc8 by Proc0; element [8][7] is preset in Proc0 */
+RecordPtr	PtrGlb;		/* first record, allocated in Proc0 */
+RecordPtr	PtrGlbNext;	/* second record, allocated in Proc0 and linked from PtrGlb->PtrComp */
+/*
+ * Proc0 -- benchmark driver.
+ *
+ * Measures the cost of an empty LOOPS-iteration loop (nulltime), allocates
+ * and initialises the two records reached through PtrGlb and PtrGlbNext,
+ * then times LOOPS passes of the Dhrystone statement mix and prints the
+ * elapsed time and the resulting dhrystones/second figure.  Timing uses
+ * time(2) when TIME is defined and the user time reported by times(2)
+ * when TIMES is defined.
+ *
+ * Takes no arguments; the (implicit int) return value is not meaningful.
+ *
+ * Differences from the distributed 1.1 driver, all outside the timed loop
+ * (whose statements are unchanged):
+ *   - a failed malloc() is reported instead of being dereferenced;
+ *   - a measured time of zero or less (possible once nulltime has been
+ *     subtracted, or when the run is shorter than one clock tick) is
+ *     reported instead of being used as a divisor;
+ *   - the times(2) based values are cast to long to match the %ld
+ *     conversions used to print them.
+ */
+Proc0()
+{
+	OneToFifty		IntLoc1;
+	REG OneToFifty		IntLoc2;
+	OneToFifty		IntLoc3;
+	REG char		CharLoc;
+	REG char		CharIndex;
+	Enumeration	 	EnumLoc;
+	String30		String1Loc;
+	String30		String2Loc;
+	extern char		*malloc();
+#ifdef TIME
+	long			time();
+	long			starttime;
+	long			benchtime;
+	long			nulltime;
+	register unsigned int	i;
+	starttime = time( (long *) 0);
+	for (i = 0; i < LOOPS; ++i);
+	nulltime = time( (long *) 0) - starttime; /* Computes o'head of loop */
+#endif
+#ifdef TIMES
+	time_t			starttime;
+	time_t			benchtime;
+	time_t			nulltime;
+	struct tms		tms;
+	register unsigned int	i;
+	times(&tms); starttime = tms.tms_utime;
+	for (i = 0; i < LOOPS; ++i);
+	times(&tms);
+	nulltime = tms.tms_utime - starttime; /* Computes overhead of looping */
+#endif
+	PtrGlbNext = (RecordPtr) malloc(sizeof(RecordType));
+	PtrGlb = (RecordPtr) malloc(sizeof(RecordType));
+	/* Both records are dereferenced below and on every pass. */
+	if (PtrGlbNext == NULL || PtrGlb == NULL)
+	{
+		printf("Dhrystone: cannot allocate benchmark records\n");
+		exit(1);
+	}
+	PtrGlb->PtrComp = PtrGlbNext;
+	PtrGlb->Discr = Ident1;
+	PtrGlb->EnumComp = Ident3;
+	PtrGlb->IntComp = 40;
+	strcpy(PtrGlb->StringComp, "DHRYSTONE PROGRAM, SOME STRING");
+#ifndef	GOOF
+	strcpy(String1Loc, "DHRYSTONE PROGRAM, 1'ST STRING");	/*GOOF*/
+#endif
+	Array2Glob[8][7] = 10;	/* Was missing in published program */
+/*****************
+-- Start Timer --
+*****************/
+#ifdef TIME
+	starttime = time( (long *) 0);
+#endif
+#ifdef TIMES
+	times(&tms); starttime = tms.tms_utime;
+#endif
+	for (i = 0; i < LOOPS; ++i)
+	{
+		Proc5();
+		Proc4();
+		IntLoc1 = 2;
+		IntLoc2 = 3;
+		strcpy(String2Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+		EnumLoc = Ident2;
+		BoolGlob = ! Func2(String1Loc, String2Loc);
+		while (IntLoc1 < IntLoc2)
+		{
+			IntLoc3 = 5 * IntLoc1 - IntLoc2;
+			Proc7(IntLoc1, IntLoc2, &IntLoc3);
+			++IntLoc1;
+		}
+		Proc8(Array1Glob, Array2Glob, IntLoc1, IntLoc3);
+		Proc1(PtrGlb);
+		for (CharIndex = 'A'; CharIndex <= Char2Glob; ++CharIndex)
+			if (EnumLoc == Func1(CharIndex, 'C'))
+				Proc6(Ident1, &EnumLoc);
+		IntLoc3 = IntLoc2 * IntLoc1;
+		IntLoc2 = IntLoc3 / IntLoc1;
+		IntLoc2 = 7 * (IntLoc3 - IntLoc2) - IntLoc1;
+		Proc2(&IntLoc1);
+	}
+/*****************
+-- Stop Timer --
+*****************/
+#ifdef TIME
+	benchtime = time( (long *) 0) - starttime - nulltime;
+	if (benchtime <= 0)
+	{
+		/* Too short to measure: the rate below would divide by zero. */
+		printf("Measured time too small to obtain meaningful results\n");
+		printf("Please increase LOOPS\n");
+	}
+	else
+	{
+		printf("Dhrystone(%s) time for %ld passes = %ld\n",
+			Version,
+			(long) LOOPS, benchtime);
+		printf("This machine benchmarks at %ld dhrystones/second\n",
+			((long) LOOPS) / benchtime);
+	}
+#endif
+#ifdef TIMES
+	times(&tms);
+	benchtime = tms.tms_utime - starttime - nulltime;
+	if (benchtime <= 0)
+	{
+		/* Fewer than one clock tick: the rate below would divide by zero. */
+		printf("Measured time too small to obtain meaningful results\n");
+		printf("Please increase LOOPS\n");
+	}
+	else
+	{
+		printf("Dhrystone(%s) time for %ld passes = %ld\n",
+			Version,
+			(long) LOOPS, (long) (benchtime/HZ));
+		printf("This machine benchmarks at %ld dhrystones/second\n",
+			(long) (((long) LOOPS) * HZ / benchtime));
+	}
+#endif
+}
+/*
+ * Proc1 -- record manipulation.
+ *
+ * PtrParIn points to a record whose PtrComp links to a second record
+ * ("NextRecord").  The second record is overwritten with a copy of *PtrGlb
+ * and then partially updated; if its Discr is not Ident1 it is copied back
+ * over *PtrParIn instead.
+ */
+Proc1(PtrParIn)
+REG RecordPtr	PtrParIn;
+{
+/* Shorthand for the record that PtrParIn links to; removed at the end. */
+#define	NextRecord	(*(PtrParIn->PtrComp))
+	structassign(NextRecord, *PtrGlb);
+	PtrParIn->IntComp = 5;
+	NextRecord.IntComp = PtrParIn->IntComp;
+	NextRecord.PtrComp = PtrParIn->PtrComp;
+	/*
+	 * NOTE(review): Proc3 declares its parameter as RecordPtr *, but a
+	 * RecordPtr is passed here, so Proc3's store lands in the first member
+	 * of the record this pointer designates.
+	 */
+	Proc3(NextRecord.PtrComp);
+	if (NextRecord.Discr == Ident1)
+	{
+		NextRecord.IntComp = 6;
+		Proc6(PtrParIn->EnumComp, &NextRecord.EnumComp);
+		NextRecord.PtrComp = PtrGlb->PtrComp;
+		/* Replaces IntComp with IntComp + 12 (see Proc7). */
+		Proc7(NextRecord.IntComp, 10, &NextRecord.IntComp);
+	}
+	else
+		structassign(*PtrParIn, NextRecord);
+#undef	NextRecord
+}
+/*
+ * Proc2 -- update the integer that IntParIO points to.
+ *
+ * When Char1Glob is 'A' the result is *IntParIO + 9 - IntGlob and the loop
+ * ends after one iteration.  EnumLoc is assigned only inside that branch,
+ * so the exit test reads an unset variable whenever Char1Glob is anything
+ * else; Proc0 calls Proc5 (which sets Char1Glob to 'A') at the start of
+ * every pass.
+ */
+Proc2(IntParIO)
+OneToFifty	*IntParIO;
+{
+	REG OneToFifty		IntLoc;
+	REG Enumeration		EnumLoc;
+	IntLoc = *IntParIO + 10;
+	for(;;)
+	{
+		if (Char1Glob == 'A')
+		{
+			--IntLoc;
+			*IntParIO = IntLoc - IntGlob;
+			EnumLoc = Ident1;
+		}
+		/* Only exit from the loop. */
+		if (EnumLoc == Ident1)
+			break;
+	}
+}
+/*
+ * Proc3 -- store PtrGlb->PtrComp through PtrParOut when PtrGlb is set,
+ * otherwise set IntGlob to 100; then set PtrGlb->IntComp to IntGlob + 12
+ * via Proc7.
+ *
+ * NOTE(review): the final call dereferences PtrGlb on both paths, including
+ * the one taken when PtrGlb is NULL.
+ */
+Proc3(PtrParOut)
+RecordPtr	*PtrParOut;
+{
+	if (PtrGlb != NULL)
+		*PtrParOut = PtrGlb->PtrComp;
+	else
+		IntGlob = 100;
+	Proc7(10, IntGlob, &PtrGlb->IntComp);
+}
+/*
+ * Proc4 -- sets Char2Glob to 'B'.  BoolLoc is computed from Char1Glob and
+ * BoolGlob but its value is never used afterwards.
+ */
+Proc4()
+{
+	REG boolean	BoolLoc;
+	BoolLoc = Char1Glob == 'A';
+	BoolLoc |= BoolGlob;
+	Char2Glob = 'B';
+}
+/* Proc5 -- reset the globals: Char1Glob becomes 'A', BoolGlob becomes FALSE. */
+Proc5()
+{
+	Char1Glob = 'A';
+	BoolGlob = FALSE;
+}
+extern boolean Func3();
+/*
+ * Proc6 -- map EnumParIn to a value stored through EnumParOut.
+ *
+ * The two assignments ahead of the switch are overridden by every case
+ * except Ident4.  Final result by input:
+ *   Ident1 -> Ident1
+ *   Ident2 -> Ident1 if IntGlob > 100, else Ident4
+ *   Ident3 -> Ident2
+ *   Ident4 -> Ident4 (Func3 is false for Ident4)
+ *   Ident5 -> Ident3
+ * There is no default case, so any other value keeps the pre-switch result.
+ */
+Proc6(EnumParIn, EnumParOut)
+REG Enumeration	EnumParIn;
+REG Enumeration	*EnumParOut;
+{
+	*EnumParOut = EnumParIn;
+	if (! Func3(EnumParIn) )
+		*EnumParOut = Ident4;
+	switch (EnumParIn)
+	{
+	case Ident1:	*EnumParOut = Ident1; break;
+	case Ident2:	if (IntGlob > 100) *EnumParOut = Ident1;
+			else *EnumParOut = Ident4;
+			break;
+	case Ident3:	*EnumParOut = Ident2; break;
+	case Ident4:	break;
+	case Ident5:	*EnumParOut = Ident3;
+	}
+}
+/*
+ * Proc7 -- store IntParI1 + IntParI2 + 2 through IntParOut.
+ */
+Proc7(IntParI1, IntParI2, IntParOut)
+OneToFifty	IntParI1;
+OneToFifty	IntParI2;
+OneToFifty	*IntParOut;
+{
+	REG OneToFifty	IntLoc;
+	IntLoc = IntParI1 + 2;
+	*IntParOut = IntParI2 + IntLoc;
+}
+/*
+ * Proc8 -- array updates around index IntLoc = IntParI1 + 5, then sets
+ * IntGlob to 5.
+ *
+ * The largest indices touched are IntLoc + 30 in Array1Par and IntLoc + 20
+ * (row) in Array2Par.  Both array types have 51 elements per dimension, so
+ * the accesses stay in range only while IntParI1 <= 15 (and IntLoc >= 1 for
+ * the [IntLoc-1] column).  Proc0 passes IntLoc1, which is 3 after its
+ * while loop.
+ */
+Proc8(Array1Par, Array2Par, IntParI1, IntParI2)
+Array1Dim	Array1Par;
+Array2Dim	Array2Par;
+OneToFifty	IntParI1;
+OneToFifty	IntParI2;
+{
+	REG OneToFifty	IntLoc;
+	REG OneToFifty	IntIndex;
+	IntLoc = IntParI1 + 5;
+	Array1Par[IntLoc] = IntParI2;
+	Array1Par[IntLoc+1] = Array1Par[IntLoc];
+	Array1Par[IntLoc+30] = IntLoc;
+	/* Two iterations: columns IntLoc and IntLoc+1 of row IntLoc. */
+	for (IntIndex = IntLoc; IntIndex <= (IntLoc+1); ++IntIndex)
+		Array2Par[IntLoc][IntIndex] = IntLoc;
+	++Array2Par[IntLoc][IntLoc-1];
+	Array2Par[IntLoc+20][IntLoc] = Array1Par[IntLoc];
+	IntGlob = 5;
+}
+/*
+ * Func1 -- compare two characters.
+ * Returns Ident1 when CharPar1 and CharPar2 differ, Ident2 when they are
+ * equal.  The copies through CharLoc1 and CharLoc2 do not change the value
+ * being compared.
+ */
+Enumeration Func1(CharPar1, CharPar2)
+CapitalLetter	CharPar1;
+CapitalLetter	CharPar2;
+{
+	REG CapitalLetter	CharLoc1;
+	REG CapitalLetter	CharLoc2;
+	CharLoc1 = CharPar1;
+	CharLoc2 = CharLoc1;
+	if (CharLoc2 != CharPar2)
+		return (Ident1);
+	else
+		return (Ident2);
+}
+/*
+ * Func2 -- string comparison.
+ *
+ * Returns TRUE when strcmp orders StrParI1 after StrParI2, FALSE otherwise.
+ *
+ * The opening loop can only finish through the branch that sets CharLoc to
+ * 'A', so the two CharLoc tests after it are never true.  If StrParI1[1]
+ * equals StrParI2[2] that branch is never taken and the loop does not
+ * terminate; with the strings Proc0 passes the characters differ.
+ */
+boolean Func2(StrParI1, StrParI2)
+String30	StrParI1;
+String30	StrParI2;
+{
+	REG OneToThirty		IntLoc;
+	REG CapitalLetter	CharLoc;
+	IntLoc = 1;
+	while (IntLoc <= 1)
+		if (Func1(StrParI1[IntLoc], StrParI2[IntLoc+1]) == Ident1)
+		{
+			CharLoc = 'A';
+			++IntLoc;
+		}
+	if (CharLoc >= 'W' && CharLoc <= 'Z')
+		IntLoc = 7;
+	if (CharLoc == 'X')
+		return(TRUE);
+	else
+	{
+		if (strcmp(StrParI1, StrParI2) > 0)
+		{
+			/* IntLoc is not read again before the return. */
+			IntLoc += 7;
+			return (TRUE);
+		}
+		else
+			return (FALSE);
+	}
+}
+/*
+ * Func3 -- returns TRUE when EnumParIn is Ident3, FALSE for any other value.
+ */
+boolean Func3(EnumParIn)
+REG Enumeration	EnumParIn;
+{
+	REG Enumeration	EnumLoc;
+	EnumLoc = EnumParIn;
+	if (EnumLoc == Ident3) return (TRUE);
+	return (FALSE);
+}
+/*
+ * Fallback byte copy, compiled only when NOSTRUCTASSIGN is defined (it then
+ * backs the structassign macro).  Copies l bytes from s to d one at a time
+ * in ascending order; nothing is returned.
+ */
+#ifdef	NOSTRUCTASSIGN
+memcpy(d, s, l)
+register char	*d;
+register char	*s;
+register int	l;
+{
+	while (l--) *d++ = *s++;
+}
+#endif
+/*
+ * Disabled leftover: the text between #if 0 and #endif repeats the closing
+ * part of Func2 and is never compiled.
+ */
+#if 0
+IntLoc;
+		}
+	if (CharLoc >= 'W' && CharLoc <= 'Z')
+		IntLoc = 7;
+	if (CharLoc == 'X')
+		return(TRUE);
+	else
+	{
+		if (strcmp(StrParI1, StrParI2) > 0)
+		{
+			IntLoc += 7;
+			return (TRUE);
+		}
+		else
+			return (FALSE);
+	}
+}
+#endif
--- a/benchmarks/geom
+++ b/benchmarks/geom
+ 13.50e+0   9.00e+0   8.00e+0                               Box dimensions
+  0.80e+0   0.99e+0   0.54e+0   0.84e+0   0.01e+0   0.84e+0
+  0.80e+0   0.01e+0   0.54e+0   0.84e+0   0.01e+0   0.84e+0
+  0.80e+0   0.01e+0   0.54e+0   0.84e+0   0.99e+0   0.84e+0 Reflectivities (RGB)
+  1.27e+0   0.00e+0   0.00e+0   0.00e+0   0.00e+0   0.00e+0
+  1.27e+0   0.00e+0   0.00e+0   0.00e+0   0.00e+0   0.00e+0
+  1.27e+0   0.00e+0   0.00e+0   0.00e+0   0.00e+0   0.00e+0 Emissivities (RGB)
--- a/benchmarks/pi.c
+++ b/benchmarks/pi.c
+#include <stdio.h>
+#include <stdlib.h>
+/*
+ * Monte Carlo estimate of pi.
+ *
+ * Draws <iterations> points (d1, d2) from random(), each coordinate scaled
+ * by 1/2147483647.0, counts the points with d1*d1 + d2*d2 <= 1 and prints
+ * 4 * hits / iterations with ten decimals.  The generator is seeded with
+ * the constant 1, so the same sequence is requested on every run.
+ *
+ * argv[1] must be a positive decimal integer.  Returns EXIT_SUCCESS after
+ * printing the estimate and EXIT_FAILURE on a usage error (previously the
+ * two statuses were swapped and an iteration count of 0 produced 0/0).
+ */
+int main(int argc, char *argv[])
+{
+   long i, its, hits = 0;
+   double d1, d2;
+   char *end;
+   if (argc != 2) {
+      fprintf(stderr, "Usage: %s <iterations>\n", argv[0]);
+      return EXIT_FAILURE;
+   }
+   its = strtol(argv[1], &end, 10);
+   /* Reject empty input, trailing junk and counts that would divide by zero. */
+   if (end == argv[1] || *end != '\0' || its <= 0) {
+      fprintf(stderr, "%s: <iterations> must be a positive integer\n", argv[0]);
+      return EXIT_FAILURE;
+   }
+   srandom(1);
+   for (i = 0; i < its; i++) {
+      d1 = ((double)random())/2147483647.0;
+      d2 = ((double)random())/2147483647.0;
+      if (((d1*d1) + (d2*d2)) <= 1)
+	 hits++;
+   }
+   printf("%.10f\n", 4.0 * ((double)hits / (double)its));
+   return EXIT_SUCCESS;
+}
--- a/benchmarks/slalom.c
+++ b/benchmarks/slalom.c
+/******************************************************************************
+                               S L A L O M
+    Scalable Language-independent Ames Laboratory One-minute Measurement
+     The following program is the first benchmark based on fixed time rather
+  than fixed problem comparison.  Not only is fixed time more representative
+  of the way people use computers, it also greatly increases the scope and
+  longevity of the benchmark.  SLALOM is very scalable, and can be used to
+  compare computers as slow as 126 floating-point operations per second
+  (FLOPS) to computers running a trillion times faster.  The scalability can
+  be used to compare single processors to massively parallel collections
+  of processors, and to study the space of problem size vs. ensemble size
+  in fine detail.  It resembles the LINPACK benchmark since it involves
+  factoring and backsolving a (nearly) dense matrix, but incorporates a
+  number of improvements to that benchmark that we hope will make SLALOM
+  a better reflection of general system performance.
+     The SLALOM benchmark solves a complete, real problem (optical radiosity
+  on the interior of a box), not a contrived kernel or a synthetic mixture of
+  sample operations.  SLALOM is unusual since it times input, problem setup,
+  solution, and output, not just the solution.  For slower computers, the
+  problem setup will take the majority of the time; it grows as the square of
+  the problem size.  The solver grows as the cube of the problem size, and
+  dominates the time for large values of n.
+     While the following is C, you are free to translate it into any
+  language you like, including assembly language specific to one computer.
+  You may use compiler directives, hand-tuned library calls, loop unrolling,
+  and even change the algorithm, if you can provide a convincing argument
+  that the program still works for the full range of possible inputs.  For
+  example, if you replace the direct solver with an iterative one, you must
+  make sure your method is correct even when the geometry is quite eccentric
+  and the box faces are highly reflective. (rho = .999)
+     The main() driver should be used with the value of 60 seconds for the
+  SLALOM benchmark.  The work done for a particular problem size is figured
+  after timing has ceased, so there is no overhead for work assessment.  The
+  residual check ||Ax - b|| is also done after timing has ceased.  Two
+  computers may be compared either by their problem size n, or by their MFLOPS
+  rate, never by the ratio of execution times.  Times will always be near one
+  minute in SLALOM.  We have used the following weights for floating-point
+  operation counting, based on the weights used by Lawrence Livermore National
+  Laboratory:
+                        OPERATION                       WEIGHT
+                    a=b, a=(constant)                      0
+            a<0, a<=0, a==0, a!=0, a>0, a>=0               0
+                 -a, fabs(a), fsgn(a, b)                   0
+                   a+b, a-b, a*b, a^2                      1
+            a<b, a<=b, a==b, a!=b, a>b, a>=b               1
+                   (int) a, (double)b                      1
+                        1/a, -1/a                          3
+                           a/b                             4
+                          sqrt(a)                          4
+               Format to or from ASCII string              6
+       sin(a), cos(a), tan(a), log(a), atan(a), exp(a)     8
+     We invite you to share with us the results of any measurements that you
+  make with SLALOM.  We do NOT accept anonymous data; machine timings will be
+  referenced and dated.
+     The least you need to do to adapt SLALOM to your computer is:
+        1.  In the "Measure" routine, set NMAX to a value large enough to keep
+            the computer working for a minute.  Vary it slightly if it helps
+            (for reasons of cache size, interleaving, etc.)
+        2.  Replace the timer call in "When" with the most accurate wall-clock
+            timer at your disposal.  If only CPU time is available, try to run
+            the job standalone or at high priority, since we are ultimately
+            interested in the top of the statistical range of performance.
+        3.  Edit in the information specific to your test in the "What"
+            routine, so that final output will be automatically annotated.
+        4.  Compile, link, and run the program, interacting to select values
+            of n that bracket a time of one minute.  Once everything is
+            running, run it as a batch job so as to record the session.
+     Examples of ways you may optimize performance:
+        1.  Unroll the loops in SetUp1 and SetUp2; it is possible to
+            vectorize both SetUp1 and SetUp2 at the cost of some extra
+            operations, program complexity, and storage.
+        2.  Replace the innermost loops of Solver with calls to well-tuned
+            libraries of linear algebra routines, such as DDOT from the
+            Basic Linear Algebra Subroutines (level 1 BLAS).  Better still,
+            use a tuned library routine for all of Solver; the sparsity
+            exploited in Solver is only a few percent, so you will usually
+            gain more than you lose by applying a dense symmetric solver.
+        3.  Parallelize the SetUp and Solver routines; all are highly
+            parallel.  Each element of the matrix can be constructed
+            independently, once each processor knows the geometry and part of
+            the partitioning into regions.  A substantial body of literature
+            now exists for performing the types of operations in Solver in
+            parallel.
+        4.  Overlap computation with output.  Once the Region routine is done,
+            the first part of the output file (patch geometry) can be written
+            while the radiosities are being calculated.
+     Examples of what you may NOT do:
+        1.  The tuning must not be made specific to the particular input
+            provided.  For example, you may not eliminate IF tests simply
+            because they always come out the same way for this input; you
+            may not use precomputed answers or table look-up unless those
+            answers and tables cover the full range of possible inputs; and
+            you may not exploit symmetry for even values of the problem size.
+        2.  You may not disable the self-consistency tests in SetUp3 and
+            Verify, nor alter their tolerance constants.
+        3.  You may not change the input or output files to unformatted
+            binary or other format that would render them difficult to create
+            or read for humans.
+        4.  You may not eliminate the reading of the "geom" file by putting
+            its data directly into the compiled program.
+        5.  You may not change any of the work assessments in Meter.  If you
+            use more floating-point operations than indicated, you must still
+            use the assessments provided.  If you find a way to use fewer
+            operations and still get the job done for arbitrary input
+            parameters, please tell us!
+                          -John Gustafson, Diane Rover, Michael Carter,
+                           and Stephen Elbert
+                           Ames Laboratory, Ames, Iowa 50011
+******************************************************************************/
+/*****************************************************************************/
+/*  The following program finds a value n such that a problem of size n      */
+/*  takes just under "goal" seconds to execute.                              */
+/*                                                                           */
+/*  John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert          */
+/*  Ames Laboratory, 3/18/90                                                 */
+/*                                                                           */
+/*  Calls:  Meter   Measures execution time for some application.            */
+/*          What    Prints work-timing statistics and system information.    */
+/*****************************************************************************/
+#include				<stdio.h>
+#include				<math.h>
+#include				<sys/time.h>
+/* NMAX = Largest npatch for your computer; adjust as needed. */
+#define		NMAX		2048
+#define		EPS			(0.5e-8)	/* Verify rejects a residual above 3 * EPS. */
+#define		FALSE		(1==0)
+#define		TRUE		(!FALSE)
+#define		MAX(a,b)	(((a) > (b)) ? (a) : (b))	/* Larger argument is evaluated twice. */
+/* Global variables and function return types.  The functions are declared
+ * old-style, without parameter lists; What() does not appear in this list. */
+double 	goal,		/* User input, fixed-time benchmark goal, in seconds. */
+		timing,		/* Elapsed time returned by Meter routine, in seconds.*/
+		work,		/* In this case, number of FLOPs performed.           */
+		When(),		/* Wall clock in seconds.                             */
+		Ddot();		/* Double dot product.                                */
+int		mean,		/* Avg between upper and lower bounds for bisection   */
+					/* method.                                            */
+		n,			/* The problem size.                                  */
+		nupper,		/* Upper bound on problem size, used in iterating     */
+					/* toward goal.                                       */
+		Meter(),	/* Driver for following benchmark functions.          */
+		Reader (),	/* Reads problem description from 'geom' file.        */
+		Region (),	/* Subdivides box faces into patches.                 */
+		SetUp3 (),	/* Set up matrix to solve.                            */
+		Storer (),	/* Write result to 'answer' file.                     */
+		Verify ();	/* Verify the radiosity solution from solver.         */
+void	SetUp1 (),	/* Set up matrix to solve.                            */
+		SetUp2 (),	/* Set up matrix to solve.                            */
+		Solver ();	/* Solve the radiosity matrix.                        */
+/*
+ *  Interactive driver.  Reads the time goal, asks for a lower bound for n
+ *  that runs in under "goal" seconds and an upper bound that takes at least
+ *  "goal" seconds, then bisects [n, nupper] until the bounds are adjacent.
+ *  The final statistics are reported for n, the largest size found that
+ *  finishes within the goal.
+ *
+ *  Returns 0 on normal completion or when the user ends the dialogue with a
+ *  non-positive or unreadable bound, and 1 when the goal cannot be read or a
+ *  measurement fails once both bounds have been accepted.
+ */
+int
+main ()
+{
+	int		ok;			/* Return code temporary storage.       */
+	int		What ();	/* Defined later in this file; declared before use. */
+	/* Get desired number of seconds: */
+	printf ("Enter the number of seconds that is the goal: ");
+	if (scanf ("%lg", &goal) != 1) {
+		printf ("\nCould not read the goal.\n");
+		return 1;
+	}
+	/* Get lower and upper bounds for n from the standard input device: */
+	do {
+		printf ("Enter a lower bound for n: ");
+		/* Unreadable input would otherwise be re-scanned forever. */
+		if (scanf ("%d", &n) != 1 || n <= 0)
+			return 0;
+		ok = Meter (n, &timing, &work);
+		/* Meter leaves timing untouched on failure, so only trust it if ok. */
+		if (ok && timing >= goal)
+			printf ("Must take less than %g seconds.  Took %g.\n",
+			  goal, timing);
+	} while (!ok || timing >= goal);
+	do {
+		printf ("Enter an upper bound for n: ");
+		if (scanf ("%d", &nupper) != 1 || nupper <= 0)
+			return 0;
+		ok = Meter (nupper, &timing, &work);
+		/*
+		 *  Without the ok test a rejected size (e.g. above NMAX) would be
+		 *  judged by the previous run's timing and could be promoted to the
+		 *  lower bound n.
+		 */
+		if (ok && timing < goal) {
+			printf ("Must take at least %g seconds.  Took %g.\n",
+			  goal, timing);
+			n = MAX(nupper, n);
+		}
+	} while (!ok || timing < goal);
+	/*
+	 *  While the [n, nupper] interval is larger than 1, bisect it and
+	 *  pick a half:
+	 */
+	while (nupper - n > 1) {
+		mean = (n + nupper) / 2;
+		ok = Meter (mean, &timing, &work);
+		if (!ok) {
+			/* No fresh timing, so the interval cannot be narrowed. */
+			printf ("Measurement failed for n = %d.\n", mean);
+			return 1;
+		}
+		if (timing < goal)
+			n = mean;
+		else
+			nupper = mean;
+		printf ("New interval: [%d,%d]\n", n, nupper);
+	}
+	/* Ensure that most recent run was for n, not nupper. */
+	ok = Meter (n, &timing, &work);
+	if (!ok) {
+		printf ("Measurement failed for n = %d.\n", n);
+		return 1;
+	}
+	/* Print out final statistics. */
+	What (n, timing, work);
+	return 0;
+}
+/*****************************************************************************/
+/* Prints a fixed system-description table followed by the measured work,   */
+/* time, problem size, speed and an estimate of data memory use.  The table  */
+/* text should be edited to describe the system the benchmark is run on.     */
+/*****************************************************************************/
+What (n, timing, work)
+int n;
+double timing, work;
+{
+	static char *info[] = {
+		"Machine:  SUN 4/370GX          Processor:  SPARC",
+		"Memory:   32 MB                # of procs: 1",
+		"Cache:    128 KB               # used:     1",
+		"NMAX:     512                  Clock:      25 MHz",
+		"Disk:     .3GB SCSI+.7GB SMD   Node name:  amssun2",
+		"OS:       SUNOS 4.0.3          Timer:      Wall, gettimeofday()",
+		"Language: C                    Alone:      yes",
+		"Compiler: cc                   Run by:     M. Carter",
+		"Options:  -O                   Date:       23 May 1990",
+		NULL
+	};
+	char	**line = info;	/* Walks the NULL-terminated table above. */
+	int		bytes;			/* Estimated data memory for size n.      */
+	putchar ('\n');
+	while (*line != NULL) {
+		puts (*line);
+		line++;
+	}
+	printf ("M ops:    %-13lg        Time:       %-.3lf seconds\n",
+	  work * 1e-6, timing);
+	printf ("n:        %-6d               MFLOPS:     %-.5lg\n",
+	  n, (work / timing) * 1e-6);
+	bytes = 8 * n * n + 120 * n + 800;
+	printf ("Approximate data memory use: %d bytes.\n", bytes);
+}
+/*****************************************************************************/
+/*  This routine measures time required on a revised LINPACK-type benchmark, */
+/*  including input, matrix generation, solution, and output.                */
+/*                                                                           */
+/*  John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert          */
+/*  Ames Laboratory, 3/18/90                                                 */
+/*                                                                           */
+/*  Calls: Reader  Reads the problem description from secondary storage.     */
+/*         Region  Partitions box surface into rectangular regions (patches).*/
+/*         SetUp1  Sets up equations from patch geometries-parallel faces.   */
+/*         SetUp2  Sets up equations from patch geometries-orthogonal faces. */
+/*         SetUp3  Sets up equations-row normalization and radiant props.    */
+/*         Solver  Solves the equations by LDL factorization.                */
+/*         Storer  Stores solution (patch radiosities) on secondary storage. */
+/*         When    Returns wall-clock time, in seconds.                      */
+/*****************************************************************************
+ *  Returns FALSE, after printing a message, when npatch is below 6 or above
+ *  NMAX, and also when Reader, Region or SetUp3 returns FALSE; on all of
+ *  these paths *timing and *work are left unmodified.  Otherwise it prints
+ *  the per-task table, calls Verify (whose result is not examined) and
+ *  returns TRUE.  The 'answer' file is unlinked before the timed section.
+ *  The large work arrays are static, so they are shared between calls.
+ *****************************************************************************/
+Meter (npatch, timing, work)
+int		npatch;		/* In, problem size, here the number of equations. */
+double	*timing,	/* Out, number of seconds measured.                */
+		*work;		/* Out, work done, here the number of FLOPs.       */
+{
+	static
+	double	area[NMAX],			/* Areas of patches * 8 * pi.                */
+			box[7],				/* Dimensions of box in x, y, z directions.  */
+			coeff[NMAX][NMAX],	/* The coefficients of the eqns to solve.    */
+			diag[3][NMAX],		/* Diag terms of the eqns to solve. (RGB)    */
+			emiss[6][3],		/* (RGB) emissivities of patches.            */
+			place[3][NMAX],		/* Width-height-depth position of patches.   */
+			result[3][NMAX],	/* Answer radiosities (RGB).                 */
+			rho[6][3],			/* (RGB) Reflectivities of patches.          */
+			rhs[3][NMAX],		/* Right-hand sides of eqns to solve (RGB).  */
+			size[2][NMAX];		/* Width-height sizes of patches.            */
+	double	ops[8],				/* Floating-point operation counts.          */
+			p[6],				/* Number of patches in faces.               */
+			sec[8],				/* Times for routines, in seconds.           */
+			tmp1, tmp2;			/* Double temporary variables.               */
+	int		i,					/* Loop counter.                             */
+			itmp1,				/* Integer temporary variable.               */
+			non0;				/* Index of first nonzero off-diagonal elem. */
+	static
+	int		loop[6][2];			/* Patch number ranges for faces.            */
+	static char *tasks[] = {	/* Names of all the functions in benchmark.  */
+		"Reader", "Region",
+		"SetUp1", "SetUp2",
+		"SetUp3", "Solver",
+		"Storer"
+	};
+	static char *format =		/* Output line format.                       */
+		"%6.6s%8.3f%17.0f%14.6f%10.1f %%\n";
+	/* First check that npatch lies between 6 and NMAX: */
+	if (npatch < 6) {
+		printf ("Must be at least 6, the number of faces.\n");
+		return (FALSE);
+	}
+	else if (npatch > NMAX) {
+		printf ("Exceeds %d = maximum for this system.\n", NMAX);
+		return (FALSE);
+	}
+	/* Ensure that previous 'answer' file is deleted: */
+	unlink ("answer");
+	/* Time the tasks, individually and collectively.  */
+	sec[0] = When();
+	if (!Reader (box, rho, emiss))
+		return (FALSE);
+	sec[1] = When();
+	if (!Region (npatch, loop, box, place, size, area))
+		return (FALSE);
+	sec[2] = When();
+	SetUp1 (npatch, loop, coeff, place, size);
+	sec[3] = When();
+	SetUp2 (npatch, loop, coeff, place, size);
+	sec[4] = When();
+	if (!SetUp3 (npatch, loop, area, rho, emiss, coeff, diag, rhs))
+		return (FALSE);
+	sec[5] = When();
+	non0 = loop[1][0];
+	Solver (npatch, non0, coeff, diag, rhs, result);
+	sec[6] = When();
+	Storer (npatch, loop, place, size, result);
+	sec[7] = When();
+	*timing = sec[7] - sec[0];
+	for (i = 0 ; i < 7 ; i++)
+		sec[i] = sec[i+1] - sec[i];	/* Timestamps become per-task durations. */
+	/* Assess floating-point work done by each routine called, and total: */
+	/* Note the ops counts are tallied into a double array, and there are */
+	/* some strange casts to double in some equations.  This is to        */
+	/* prevent integer overflow.                                          */
+	itmp1 = 0;
+	tmp1 = 0.0;
+	for (i = 0 ; i < 6 ; i++) {
+        p[i] = loop[i][1] - loop[i][0] + 1;
+        tmp1 += p[i] * p[i];
+        itmp1 += sqrt(p[i] * box[i] / box[i + 1]) + 0.5;
+	}
+	tmp2 = p[0] * p[3] + p[1] * p[4] + p[2] * p[5];
+	ops[0] = 258;
+	ops[1] = 154 + (double) 8 * itmp1 + npatch;
+	ops[2] = 6 + 532 * tmp2;
+	ops[3] = 8*npatch + 370 * ((double) npatch * npatch - tmp1 - 2*tmp2) / 2.0;
+	ops[4] = 72 + (double) 9 * npatch + (double) npatch * npatch - tmp1;
+	ops[5] = npatch * (npatch * ((double) npatch + 7.5) - 2.5) - 21
+			+ (non0+1) * ((non0+1) * (2 * ((double) non0+1) - 16.5) + 35.5)
+			+ (non0+1) * npatch * (9 - 3 * ((double) non0+1));
+	ops[6] = 48 * npatch;
+	*work = ops[0] + ops[1] + ops[2] + ops[3] + ops[4] + ops[5] + ops[6];
+	/* Display timing-work-speed breakdown by routine. */
+	printf ("%d patches:\n", npatch);
+	printf (" Task  Seconds       Operations        MFLOPS    %% of Time\n");
+	for (i = 0 ; i < 7 ; i++) {
+		if (sec[i] == 0.0)
+			sec[i] = 0.001;	/* Avoids dividing ops[i] by zero below. */
+		printf (format, tasks[i], sec[i], ops[i], (ops[i] / sec[i]) * 1e-6,
+			100.0 * sec[i] / *timing);
+	}
+	printf (format, "TOTALS", *timing, *work, (*work / *timing) * 1e-6, 100.0);
+	Verify (npatch, coeff, diag, rhs, result);	/* Return value is not examined. */
+	return (TRUE);
+}
+/*****************************************************************************/
+/*  This function should return the actual, wall clock time (not CPU time)   */
+/*  in seconds as accurately as possible.  Change it to your system timer.   */
+/*                                                                           */
+/*  Returns seconds plus microseconds as a double, or 0.0 if the clock       */
+/*  could not be read.                                                       */
+/*****************************************************************************/
+double
+When()
+{
+	struct timeval tp;
+	/*
+	 *  The timezone argument of gettimeofday() is obsolete; POSIX leaves the
+	 *  behavior unspecified when it is not a null pointer, so pass NULL.
+	 */
+	if (gettimeofday (&tp, NULL) != 0)
+		return (0.0);		/* tp was not filled in; do not read it. */
+	return ((double) tp.tv_sec + (double) tp.tv_usec * 1e-6);
+}
+/*****************************************************************************/
+/* The following routine reads in the problem description from secondary     */
+/* storage, and checks that numbers are in reasonable ranges.                */
+/*                                                                           */
+/* The 'geom' file supplies 3 box dimensions, then 18 reflectivities, then   */
+/* 18 emissivities.  Each group of 18 is read color by color (six faces for  */
+/* color 0, then color 1, then color 2); the rest of the line following each */
+/* group is discarded.                                                       */
+/*                                                                           */
+/* Exits with status 1 if the file is missing or a group is incomplete.      */
+/* Returns FALSE (after a message) if a value is out of range, else TRUE     */
+/* with box[3..5] repeating box[0..2] and box[6] repeating box[0].           */
+/*****************************************************************************/
+Reader (box, rho, emiss)
+double	box[],			/* Out: Dimensions of box in x, y, z directions.  */
+		rho[][3],		/* Out: (RGB) Reflectivities of patches.          */
+		emiss[][3];		/* Out: (RGB) emissivities of patches.            */
+{
+	int		i, j,		/* Loop counters.                            */
+			n;			/* Number of values successfully read.       */
+	double	tmp1;		/* Maximum emissivity.                       */
+	FILE	*infile;	/* Input file pointer.                       */
+	char	buff[81];	/* Buffer used to eat a line of input.       */
+	/* Open the input file and read in the data. */
+	if ((infile = fopen ("geom", "r")) == NULL) {
+		printf ("slalom:  'geom' geometry file not found.\n");
+		exit (1);
+	}
+	/* Read the box coordinates and error check. */
+	n = 0;
+	for (i = 0 ; i < 3 ; i++) {
+		/* Count successes only; fscanf may also return 0 or EOF. */
+		if (fscanf (infile, "%lg", &box[i]) == 1)
+			n++;
+	}
+	fgets (buff, 80, infile);		/* Eat the rest of the line. */
+	if (n != 3) {
+		printf ("Must specify exactly 3 box coordinates.\n");
+		exit(1);
+	}
+	/* Read the reflectivities and error check. */
+	n = 0;
+	for (j = 0 ; j < 3 ; j++) {
+		for (i = 0 ; i < 6 ; i++) {
+			if (fscanf (infile, "%lg", &rho[i][j]) == 1)
+				n++;
+		}
+	}
+	fgets (buff, 80, infile);		/* Eat the rest of the line. */
+	if (n != 18) {
+		printf ("Must specify exactly 18 reflectivities.\n");
+		exit(1);
+	}
+	/* Read the emissivities and error check. */
+	n = 0;
+	for (j = 0 ; j < 3 ; j++) {
+		for (i = 0 ; i < 6 ; i++) {
+			if (fscanf (infile, "%lg", &emiss[i][j]) == 1)
+				n++;
+		}
+	}
+	fgets (buff, 80, infile);		/* Eat the rest of the line. */
+	if (n != 18) {
+		printf ("Must specify exactly 18 emissivities.\n");
+		exit(1);
+	}
+	fclose (infile);
+	/* Now sanity-check the values that were just read. */
+	for (j = 0 ; j < 3 ; j++) {
+		if (box[j] < 1.0 || box[j] >= 100.0) {
+			printf ("Box dimensions must be between 1 and 100.\n");
+			return (FALSE);
+		}
+		/* Repeat the dimensions so callers can index box[face + 1]. */
+		box[j+3] = box[j];
+		tmp1 = 0.0;
+		for (i = 0 ; i < 6 ; i++) {
+			/*
+			 *  NOTE(review): a reflectivity of exactly 0 is accepted here,
+			 *  while SetUp3 computes 1.0 / rho -- confirm that is intended.
+			 */
+			if (rho[i][j] < 0.000 || rho[i][j] > 0.999) {
+				printf ("Reflectivities must be between .000 and .999.\n");
+				return (FALSE);
+			}
+			if (emiss[i][j] < 0.0) {
+				printf ("Emissivity cannot be negative.\n");
+				return (FALSE);
+			}
+			if (tmp1 < emiss[i][j])
+				tmp1 = emiss[i][j];
+		}
+		/* Each color needs at least one emitting face. */
+		if (tmp1 == 0.0) {
+			printf ("Emissivities are zero.  Problem is trivial.\n");
+			return (FALSE);
+		}
+	}
+	box[6] = box[3];
+	return (TRUE);
+}
+/*****************************************************************************/
+/* The following routine decomposes the surface of a variable-sized box      */
+/* into patches that are as nearly equal in size and square as possible.     */
+/*****************************************************************************
+ * First fills loop[f][0] and loop[f][1] with the first and last patch index
+ * of each of the six faces, using the running sum of box[f] * box[f + 1]
+ * relative to the total surface area to decide where each face ends.
+ * Then each face is cut into numcol columns of equal width, and each column
+ * into numrow rows.  For every patch it stores the width and height in size,
+ * two in-face offsets in place[0..1], a third coordinate in place[2] (0.0
+ * for faces 0-2, box[iface - 1] for faces 3-5) and width * height * 8 * pi
+ * in area.
+ * Returns FALSE, after printing a message, if some column would receive no
+ * rows; TRUE otherwise.
+ *****************************************************************************/
+Region (npatch, loop, box, place, size, area)
+int		npatch,			/* In: Problem size.                             */
+		loop[][2];		/* Out: Patch number ranges for faces.           */
+double	area[],			/* Out: 8pi * areas of the patches.              */
+		box[],			/* In: Dimensions of box in x, y, z directions.  */
+		place[][NMAX],	/* Out: Width-height-depth positions of patches. */
+		size[][NMAX];	/* Out: Width-height sizes of patches.           */
+{
+	int		icol,	/* Loop counter over the number of columns. */
+			ipatch,	/* Loop counter over the number of patches. */
+			iface,	/* Loop counter over the number of faces.   */
+			itmp1,	/* Integer temporary variables.             */
+			itmp2,	/* Integer temporary variables.             */
+			last,	/* Inner loop ending value.                 */
+			lead,	/* Inner loop starting value.               */
+			numcol,	/* Number of columns on faces.              */
+			numpat,	/* Number of patches on a face.             */
+			numrow;	/* Number of rows of patches in a column.   */
+	double	height,	/* Height of a patch within a column.       */
+			tmp1,	/* double temporary variables.              */
+			tmp2,	/* double temporary variables.              */
+			tmp3,	/* double temporary variables.              */
+			tmp4,	/* double temporary variables.              */
+			width;	/* Width of a column of patches.            */
+	/* Allocate patches to each face, proportionate to area of each face. */
+	tmp1 = 2.0 * (box[0] * box[1] + box[1] * box[2] + box[2] * box[0]);
+	tmp2 = 0.0;
+	tmp3 = npatch;
+	loop[0][0] = 0;
+	for (iface = 0 ; iface < 5 ; iface++) {
+		tmp2 = tmp2 + box[iface] * box[iface + 1];
+		loop[iface][1] = (int) (tmp3 * tmp2 / tmp1 + 0.5) - 1;
+		loop[iface + 1][0] = loop[iface][1] + 1;
+	}
+	loop[5][1] = npatch - 1;	/* The last face takes whatever remains. */
+	/* Subdivide each face into numpat patches. */
+	for (iface = 0 ; iface < 6 ; iface++) {
+		numpat = loop[iface][1] - loop[iface][0] + 1;
+		tmp3 = 0.0;
+		if (iface >= 3)
+			tmp3 = box[iface-1];
+		numcol = (int) (sqrt(numpat * box[iface] / box[iface + 1]) + 0.5);
+		if (numcol > numpat)
+			numcol = numpat;
+		if (numcol == 0)
+			numcol = 1;
+		width = box[iface] / numcol;
+		itmp1 = numcol - 1;
+		tmp1 = 0.0;
+		for (icol = 0 ; icol < numcol ; icol++) {
+			itmp2 = itmp1 / numcol;	/* Patches already used by earlier columns. */
+			numrow = (itmp1 + numpat) / numcol - itmp2;
+			if (numrow == 0) {
+				printf ("Eccentric box requires more patches.\n");
+				return (FALSE);
+			}
+			height = box[iface + 1] / numrow;
+			tmp2 = 0.0;
+			tmp4 = width * height * (8.0 * M_PI);
+			lead = loop[iface][0] + itmp2;
+			last = lead + numrow;
+			for (ipatch = lead ; ipatch < last ; ipatch++) { 
+				size[0][ipatch] = width;
+				size[1][ipatch] = height;
+				place[0][ipatch] = tmp1;
+				place[1][ipatch] = tmp2;
+				place[2][ipatch] = tmp3;
+				area[ipatch] = tmp4;
+				tmp2 = tmp2 + height;
+			}
+			tmp1 = tmp1 + width;
+			itmp1 = itmp1 + numpat;
+		}
+	}
+	return (TRUE);
+}
+/*****************************************************************************/
+/* This routine sets up the radiosity matrix for parallel patches.           */
+/*****************************************************************************
+ * For each of the three face pairs (iface, iface + 3) it visits every patch
+ * ipatch on the first face and jpatch on the second, computes one coupling
+ * value from the corner offsets d and their squares d2, and stores it in
+ * both coeff[ipatch][jpatch] and coeff[jpatch][ipatch].
+ * tmp1 is the square of place[2] of the first patch on face jface.
+ * The sign flips on tmp2 and tmp8 inside the loops make successive terms
+ * enter the running totals with alternating signs.
+ * npatch is not referenced in the body.
+ *****************************************************************************/
+void
+SetUp1 (npatch, loop, coeff, place, size)
+int		npatch,			/* In: Problem size.                             */
+		loop[][2];		/* In: Patch number ranges for faces.            */
+double	coeff[][NMAX],	/* Out: The coefficients of the eqns to solve.   */
+		place[][NMAX],		/* In: Width-height-depth positions of patches.  */
+		size[][NMAX];		/* In: Width-height sizes of patches.            */
+{
+	int		i, j, k,	/* General loop counters.                            */
+			m, n,		/* General loop counters.                            */
+			iface,		/* Loop counter over the number of faces.            */
+			ipatch,		/* Loop counter over the number of patches.          */
+			jface,		/* Face coupled to iface when computing mat. elems.  */
+			jpatch;		/* Patch coupled to ipatch when computing mat. elems.*/
+	double	d[2][2][2],	/* Point-to-point couplings between patch corners.   */
+			d2[2][2][2],/* Squares of d values, to save recomputation.       */
+			tmp1, tmp2,	/* Double temporary variables.                       */
+			tmp3, tmp4,	/* Double temporary variables.                       */
+			tmp5, tmp6,	/* Double temporary variables.                       */
+			tmp7, tmp8;	/* Double temporary variables.                       */
+	for (iface = 0 ; iface < 3 ; iface++) {
+		jface = iface + 3;
+		tmp1 = place[2][loop[jface][0]] * place[2][loop[jface][0]];
+		tmp6 = tmp1 + tmp1;
+		for (ipatch = loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
+			for (jpatch=loop[jface][0] ; jpatch <= loop[jface][1] ; jpatch++) {
+				for (j = 0 ; j < 2 ; j++) {
+					d [0][0][j] = place[j][jpatch] - place[j][ipatch];
+					d [1][0][j] = d[0][0][j] + size[j][jpatch];
+					d [0][1][j] = d[0][0][j] - size[j][ipatch];
+					d [1][1][j] = d[1][0][j] - size[j][ipatch];
+					d2[0][0][j] = d[0][0][j] * d[0][0][j];
+					d2[1][0][j] = d[1][0][j] * d[1][0][j];
+					d2[0][1][j] = d[0][1][j] * d[0][1][j];
+					d2[1][1][j] = d[1][1][j] * d[1][1][j];
+				}
+				tmp2 = 0.0;
+				for (m = 0 ; m < 2 ; m++) {
+					for (i = 0 ; i < 2 ; i++) {
+						tmp3 = d2[m][i][1] + tmp1;
+						tmp4 = sqrt(tmp3);
+						tmp5 = 1.0 / tmp4;
+						tmp8 = 0.0;
+						for (k = 0 ; k < 2 ; k++) {
+							for (n = 0 ; n < 2 ; n++) {
+								tmp7 = d[k][n][0];
+								tmp8 = -tmp7 * atan(tmp7 * tmp5) - tmp8;
+							}
+							tmp8 = -tmp8;
+						}
+						tmp2 = -4.0 * tmp4 * tmp8 - tmp2 - tmp6 *
+						  log(((d2[1][0][0] + tmp3) * (d2[0][1][0] + tmp3)) /
+						      ((d2[0][0][0] + tmp3) * (d2[1][1][0] + tmp3)));
+					}
+					tmp2 = -tmp2;
+				}
+				for (m = 0 ; m < 2 ; m++) {
+					for (i = 0 ; i < 2 ; i++) {
+						tmp4 = sqrt(d2[m][i][0] + tmp1);
+						tmp5 = 1.0 / tmp4;
+						tmp8 = 0.0;
+						for (k = 0 ; k < 2 ; k++) {
+							for (n = 0 ; n < 2 ; n++) {
+								tmp7 = d[k][n][1];
+								tmp8 = -tmp7 * atan(tmp7 * tmp5) - tmp8;
+							}
+							tmp8 = -tmp8;
+						}
+						tmp2 = -4.0 * tmp4 * tmp8 - tmp2;
+					}
+					tmp2 = -tmp2;
+				}
+				coeff[ipatch][jpatch] = tmp2;
+				coeff[jpatch][ipatch] = tmp2;	/* Keep the matrix symmetric. */
+			}
+		}
+	}
+}
+/*****************************************************************************/
+/* This routine sets up the radiosity matrix for orthogonal patches.         */
+/*****************************************************************************
+ * For every face iface and for each of the next two faces in cyclic order,
+ * jface = (iface + m + 1) % 6 with m = 0, 1, it visits every patch pair
+ * (ipatch on iface, jpatch on jface) and stores fabs(4.0 * tmpa + tmpb) in
+ * both coeff[ipatch][jpatch] and coeff[jpatch][ipatch].
+ * Local name suffixes: ...d values are differences or square roots, ...s
+ * values are squares, ...i values are reciprocals of the matching ...d.
+ * npatch is not referenced in the body.
+ *****************************************************************************/
+void
+SetUp2 (npatch, loop, coeff, place, size)
+int		npatch,			/* In: Problem size.                             */
+		loop[][2];		/* In: Patch number ranges for faces.            */
+double	coeff[][NMAX],	/* Out: The coefficients of the eqns to solve.   */
+		place[][NMAX],	/* In: Width-height-depth positions of patches.  */
+		size[][NMAX];	/* In: Width-height sizes of patches.            */
+{
+	int		m,			/* General loop counters.                            */
+			iface,		/* Loop counter over the number of faces.            */
+			ipatch,		/* Loop counter over the number of patches.          */
+			jface,		/* Face coupled to iface when computing mat. elems.  */
+			jpatch;		/* Patch coupled to ipatch when computing mat. elems.*/
+	double	tmpb, tmpa,
+			c11d, c12d, c21d, c22d, c11s, c12s, c21s, c22s,
+			d11d, d12d, d21d, d22d, d11s, d12s, d21s, d22s,
+			d11i, d12i, d21i, d22i, a10s, a20s, b01s, b02s,
+			e1111, e1211, e2111, e2211, e1112, e1212, e2112, e2212,
+			e1121, e1221, e2121, e2221, e1122, e1222, e2122, e2222;
+	for (iface = 0 ; iface < 6 ; iface++) {
+		for (m = 0 ; m < 2 ; m++) {
+			jface = (iface + m + 1) % 6;
+			for (ipatch=loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
+				a10s = place[m][ipatch] - place[2][loop[jface][0]];
+				a20s = a10s + size[m][ipatch];
+				a10s = a10s * a10s;
+				a20s = a20s * a20s;
+				for (jpatch=loop[jface][0] ; jpatch<=loop[jface][1];jpatch++) {
+					c11d = place[m][jpatch] - place[1-m][ipatch];
+					c12d = c11d + size[m][jpatch];
+					c21d = c11d - size[1-m][ipatch];
+					c22d = c12d - size[1-m][ipatch];
+					c11s = c11d * c11d;
+					c12s = c12d * c12d;
+					c21s = c21d * c21d;
+					c22s = c22d * c22d;
+					b01s = place[1 - m][jpatch] - place[2][ipatch];
+					b02s = b01s + size[1 - m][jpatch];
+					/**/
+					/* Bump the term by a small real to avoid
+					 * singularities in coupling function:
+					 */
+					b01s = b01s * b01s + 1e-35;
+					b02s = b02s * b02s + 1e-35;
+					d11s = a10s + b01s;
+					d12s = a10s + b02s;
+					d21s = a20s + b01s;
+					d22s = a20s + b02s;
+					d11d = sqrt(d11s);
+					d12d = sqrt(d12s);
+					d21d = sqrt(d21s);
+					d22d = sqrt(d22s);
+					d11i = 1.0 / d11d;
+					d12i = 1.0 / d12d;
+					d21i = 1.0 / d21d;
+					d22i = 1.0 / d22d;
+					tmpa =	  d11d * ( c11d * atan (c11d * d11i)
+									 - c12d * atan (c12d * d11i)
+									 - c21d * atan (c21d * d11i)
+									 + c22d * atan (c22d * d11i))
+							+ d12d * (-c11d * atan (c11d * d12i)
+									 + c12d * atan (c12d * d12i)
+									 + c21d * atan (c21d * d12i)
+									 - c22d * atan (c22d * d12i))
+							+ d21d * (-c11d * atan (c11d * d21i)
+									 + c12d * atan (c12d * d21i)
+									 + c21d * atan (c21d * d21i)
+									 - c22d * atan (c22d * d21i))
+							+ d22d * ( c11d * atan (c11d * d22i)
+									 - c12d * atan (c12d * d22i)
+									 - c21d * atan (c21d * d22i)
+									 + c22d * atan (c22d * d22i));
+					e1111 = c11s + d11s;
+					e1211 = c12s + d11s;
+					e2111 = c21s + d11s;
+					e2211 = c22s + d11s;
+					e1112 = c11s + d12s;
+					e1212 = c12s + d12s;
+					e2112 = c21s + d12s;
+					e2212 = c22s + d12s;
+					e1121 = c11s + d21s;
+					e1221 = c12s + d21s;
+					e2121 = c21s + d21s;
+					e2221 = c22s + d21s;
+					e1122 = c11s + d22s;
+					e1222 = c12s + d22s;
+					e2122 = c21s + d22s;
+					e2222 = c22s + d22s;
+					tmpb =    c11s * log( e1111 * e1122 / (e1112 * e1121))
+							- c12s * log( e1211 * e1222 / (e1212 * e1221))
+							- c21s * log( e2111 * e2122 / (e2112 * e2121))
+							+ c22s * log( e2211 * e2222 / (e2212 * e2221))
+							- d11s * log( e1111 * e2211 / (e1211 * e2111))
+							+ d12s * log( e1112 * e2212 / (e1212 * e2112))
+							+ d21s * log( e1121 * e2221 / (e1221 * e2121))
+							- d22s * log( e1122 * e2222 / (e1222 * e2122));
+					coeff[ipatch][jpatch] = fabs(4.0 * tmpa + tmpb);
+					coeff[jpatch][ipatch] = coeff[ipatch][jpatch];
+				}
+			}
+		}
+	}
+}
+/*****************************************************************************/
+/* This routine sets up the radiosity matrix... normalizes row sums to 1,    */
+/* and includes terms derived from reflectivities and emissivities of faces. */
+/*****************************************************************************
+ * For each patch it sums the row of coeff over all patches outside the
+ * patch's own face and compares that sum with area[ipatch]; if they differ
+ * by more than 0.5e-9 times the sum it prints a message and returns FALSE.
+ * Otherwise the entries coupling the patch to its own face are zeroed, and
+ * for each color j the diagonal is set to -sum / rho and the right-hand side
+ * to -sum * emiss / rho.  Returns TRUE when every row passes.
+ * NOTE(review): 1.0 / rho[iface][j] is evaluated without a zero check.
+ *****************************************************************************/
+SetUp3 (npatch, loop, area, rho, emiss, coeff, diag, rhs)
+int		npatch,			/* In: Problem size.                                 */
+		loop[][2];		/* In: Patch number ranges for faces.                */
+double	area[],			/* In: 8 * pi * areas of the patches.                */
+		rho[][3],		/* In: (RGB) Reflectivities of the face interiors.   */
+		emiss[][3],		/* In: (RGB) Emissivities of the face interiors.     */
+		coeff[][NMAX],	/* Out: The coefficients of the eqns to solve.       */
+		diag[][NMAX],	/* Out: (RGB) Diagonal terms of the system.          */
+		rhs[][NMAX];	/* Out: (RGB) Right-hand sides of system to solve.   */
+{
+	/*
+	 *  Local variables:
+	 *    iface     Loop counter over the number of faces.
+	 *    ipatch    Outer loop counter over the number of patches.
+	 *    j         Loop counter over each color (R-G-B).
+	 *    jpatch    Inner loop counter over the number of patches.
+	 *    tmp1      double temporary variable.
+	 *    vtmp1-2   double vector temporary variables.
+	 */
+	int		j,			/* (RGB) Loop counter over each color.               */
+			iface,		/* Loop counter over the number of faces.            */
+			ipatch,		/* Outer loop counter over the number of patches.    */
+			jpatch;		/* Inner loop counter over the number of patches.    */
+	double	tmp1,		/* Double temporary variable.                        */
+			vtmp1[3],	/* Double vector temporary variables.                */
+			vtmp2[3];	/* Double vector temporary variables.                */
+	/* Ensure that row sums to 1, and put in reflectivities (rho) and        */
+	/* emissivities.                                                         */
+	for (iface = 0 ; iface < 6 ; iface++) {
+		for (j = 0 ; j < 3 ; j++) {
+          vtmp1[j] = 1.0 / rho[iface][j];
+          vtmp2[j] = emiss[iface][j] * vtmp1[j];
+		}
+		for (ipatch = loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
+			tmp1 = 0.0;
+			for (jpatch = 0 ; jpatch < loop[iface][0] ; jpatch++) {
+				tmp1 += coeff[ipatch][jpatch];
+			}
+			for (jpatch = loop[iface][1]+1 ; jpatch < npatch ; jpatch++) {
+				tmp1 += coeff[ipatch][jpatch];
+			}
+			/* Make sure row sum (total form factor) is close to 1: */
+			if (fabs(tmp1 - area[ipatch]) > (0.5e-9 * tmp1)) {
+				printf ("Total form factor is too far from unity.\n");
+				return (FALSE);
+			}
+			tmp1 = -tmp1;
+			/* Set coplanar patch interactions to zero. */
+			for (jpatch=loop[iface][0] ; jpatch <= loop[iface][1] ; jpatch++) {
+				coeff[ipatch][jpatch] = 0.0;
+			}
+			/* Assign diagonal entries and right-hand sides. */
+			for (j = 0 ; j < 3 ; j++) {
+				diag[j][ipatch] = vtmp1[j] * tmp1;
+				rhs[j][ipatch] = vtmp2[j] * tmp1;
+			}
+		}
+	}
+	return (TRUE);
+}
+/*****************************************************************************/
+/* This routine factors and backsolves a real, symmetric, near-dense matrix  */
+/* by LDL factorization.  No pivoting; the matrix is diagonally dominant.    */
+/*****************************************************************************
+ * The whole load / factor / backsolve sequence is repeated for each of the
+ * three colors m, reloading the diagonal from diag[m] and the lower
+ * triangle from the upper triangle each time.  Rows below non0 only get
+ * their diagonal replaced by its reciprocal.  Ddot is called as
+ * Ddot (count, x, x-stride, y, y-stride); a stride of NMAX walks down a
+ * column of coeff.
+ *****************************************************************************/
+void
+Solver (npatch, non0, coeff, diag, rhs, result)
+int		npatch,			/* In: Problem size.                                 */
+		non0;			/* In: Index of first nonzero off-diagonal mat. elem.*/
+double	coeff[][NMAX],	/* In/Out: The coefficients of the eqns to solve.    */
+		diag[][NMAX],	/* Out: (RGB) Diagonal terms of the system.          */
+		rhs[][NMAX],	/* In: (RGB) Right-hand sides of system to solve.    */
+		result[][NMAX];	/* Out: (RGB) solution radiosities.                  */
+{
+	int		i, j,		/* General loop counters.     */
+			k, m;		/* General loop counters.     */
+	double	tmp1;		/* Double temporary variable. */
+	/* Load lower triangle of coefficients, diagonal, and solution vector. */
+	for (m = 0 ; m < 3 ; m++) {
+		for (i = non0 ; i < npatch ; i++) {
+			coeff[i][i] = diag[m][i];
+			result[m][i] = rhs[m][i];
+			for (j = 0 ; j < i ; j++) {
+				coeff[i][j] = coeff[j][i];
+			}
+		}
+		/* Factor matrix, writing factors on top of original matrix. */
+		for (j = 0 ; j < non0 ; j++) {
+			coeff[j][j] = 1.0 / diag[m][j];
+			result[m][j] = rhs[m][j];
+		}
+		for (j = non0 ; j < npatch ; j++) {
+			for (k = non0 ; k < j ; k++) {
+				coeff[j][k] -= Ddot (k, &coeff[k][0], 1, &coeff[j][0], 1);
+			}
+			for (k = 0 ; k < j ; k++) {
+				tmp1 = coeff[j][k];
+				coeff[j][k] = tmp1 * coeff[k][k];
+				coeff[j][j] -= tmp1 * coeff[j][k];
+			}
+			coeff[j][j] = 1.0 / coeff[j][j];	/* Diagonal holds reciprocals. */
+		}
+		/* Backsolve, in three stages (for L, D, and L transpose). */
+		for (k = non0 ; k < npatch ; k++) {
+			result[m][k] -= Ddot (k, &result[m][0], 1, &coeff[k][0], 1);
+		}
+		for (k = 0 ; k < npatch ; k++) {
+			result[m][k] *= coeff[k][k];
+		}
+		for (k = npatch - 2 ; k >= non0 ; k--) {
+			result[m][k] -= Ddot (npatch-(k+1), &result[m][k+1], 1,
+								&coeff[k+1][k], NMAX);
+		}
+		for (k = non0 - 1 ; k >= 0 ; k--) {
+			result[m][k] -= Ddot (npatch-non0, &result[m][non0], 1,
+								&coeff[non0][k], NMAX);
+		}
+	}
+}
+/*****************************************************************************/
+/* Writes the solution to the text file 'answer': first a table of patch     */
+/* positions and sizes, then a table of the three radiosities per patch.     */
+/* Patch and face numbers are written one-based.  If the file cannot be      */
+/* opened, a message is printed and the program exits with status 1.         */
+/*****************************************************************************/
+Storer (npatch, loop, place, size, result)
+int		npatch,			/* In: Problem size.                                 */
+		loop[][2];		/* In: Patch number ranges for faces.                */
+double	result[][NMAX],	/* In: (RGB) Radiosity solutions.                    */
+		place[][NMAX],	/* In: Width-height-depth positions of patches.      */
+		size[][NMAX];	/* In: Width-height sizes of patches.                */
+{
+	int		face,		/* Current face, 0 through 5.                        */
+			patch,		/* Current patch within the face.                    */
+			final;		/* Last patch index belonging to the face.           */
+	FILE	*out;		/* Stream for the 'answer' file.                     */
+	out = fopen("answer", "w");
+	if (out == NULL) {
+		printf ("Unable to open 'answer' file.\n");
+		exit (1);
+	}
+	/* Geometry table. */
+	fprintf (out, "%d patches:\n", npatch);
+	fprintf (out,
+	  " Patch  Face       Position in w, h, d              Width     Height\n");
+	for (face = 0 ; face < 6 ; face++) {
+		final = loop[face][1];
+		patch = loop[face][0];
+		while (patch <= final) {
+			fprintf (out,
+				"%5d   %4d%11.5lf%11.5lf%11.5lf  %11.5lf%11.5lf\n",
+				patch + 1, face + 1,
+				place[0][patch], place[1][patch], place[2][patch],
+				size[0][patch], size[1][patch]);
+			patch++;
+		}
+	}
+	/* Radiosity table. */
+	fprintf (out, "\n Patch  Face  Radiosities\n");
+	for (face = 0 ; face < 6 ; face++) {
+		final = loop[face][1];
+		patch = loop[face][0];
+		while (patch <= final) {
+			fprintf (out, "%5d   %4d%12.8lf%12.8lf%12.8lf\n",
+				patch + 1, face + 1,
+				result[0][patch], result[1][patch], result[2][patch]);
+			patch++;
+		}
+	}
+	fclose(out);
+}
+/*****************************************************************************/
+/* This routine verifies that the computed radiosities satisfy the equations.*/
+/*                                                                           */
+/*  John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert          */
+/*  Ames Laboratory, 3/18/90                                                 */
+/*****************************************************************************/
+Verify (npatch, coeff, diag, rhs, result)
+int		npatch;			/* In: Problem size.                                 */
+double	coeff[][NMAX],	/* In: The coefficients of the eqns to solve.        */
+		diag[][NMAX],	/* In: (RGB) Diagonal terms of the system.           */
+		rhs[][NMAX],	/* In: (RGB) Right-hand sides of system to solve.    */
+		result[][NMAX];	/* In: (RGB) Radiosity solutions.                    */
+{
+	double	resid_sum = 0.0;	/* Sum of |row residual| over all colors.  */
+	double	row_resid;			/* Residual of the row being checked.      */
+	double	anorm,				/* Max |coefficient| for current color.    */
+			xnorm;				/* Max |solution entry| for current color. */
+	int		color, row, col;	/* Loop counters.                          */
+	for (color = 0 ; color < 3 ; color++) {
+		/* Rebuild the full matrix for this color: mirror coeff[col][row] */
+		/* (col < row) into coeff[row][col] and load the diagonal.        */
+		/* Note that this overwrites part of the caller's coeff array.    */
+		for (row = 0 ; row < npatch ; row++) {
+			for (col = 0 ; col < row ; col++) {
+				coeff[row][col] = coeff[col][row];
+			}
+			coeff[row][row] = diag[color][row];
+		}
+		/* Residual rhs - coeff * result, row by row.  The two norms are  */
+		/* restarted for every color, so the values used after the loop   */
+		/* are those of the last color only.                              */
+		anorm = 0.0;
+		xnorm = 0.0;
+		for (row = 0 ; row < npatch ; row++) {
+			row_resid = rhs[color][row];
+			for (col = 0 ; col < npatch ; col++) {
+				row_resid -= (coeff[row][col] * result[color][col]);
+				anorm = MAX(anorm, fabs(coeff[row][col]));
+			}
+			xnorm = MAX(xnorm, fabs(result[color][row]));
+			resid_sum += fabs(row_resid);
+		}
+	}
+	/* Scale the accumulated residual and compare against the tolerance. */
+	resid_sum = resid_sum / (anorm * xnorm);
+	if (resid_sum > 3 * EPS) {
+		printf ("Residual is too large: %lg\n", resid_sum);
+		return (FALSE);
+	}
+	return (TRUE);
+}
+#ifdef		SUN4
+/*****************************************************************************/
+/* Double precision dot product specifically written for Sun 4/370.          */
+/* By Michael Carter and John Gustafson, May 30, 1990                        */
+/* This code unrolls the dot product four ways since that's how many         */
+/* registers are available on the SPARC.  Other RISC systems will require    */
+/* something very similar.  Also, unit stride is taken advantage of in the   */
+/* form of special cases.                                                    */
+/*                                                                           */
+/* Returns the sum of a[k*ia] * b[k*ib] for k = 0 .. n-1; 0.0 if n <= 0.     */
+/*****************************************************************************/
+double
+Ddot (n, a, ia, b, ib)
+register
+int		n,		/* Number of elements in vectors.  */
+		ia,		/* Stride of a vector in ELEMENTS. */
+		ib;		/* Stride of b vector in ELEMENTS. */
+register
+double	*a,		/* Pointer to first vector.        */
+		*b;		/* Pointer to second vector.       */
+{
+	register double	sum0 = 0.0,
+					sum1 = 0.0,
+					sum2 = 0.0,
+					sum3 = 0.0;
+	register int	m;
+	int				t;		/* Scratch for swapping the strides.  */
+	double			*tp;	/* Scratch for swapping the pointers. */
+	/* Nothing to accumulate; also keeps the mask/shift below away from */
+	/* negative counts.                                                 */
+	if (n <= 0)
+		return (0.0);
+	/* The ragged cleanup part: the first n mod 4 elements. */
+	m = n & 3;
+	while (m--) {
+		sum0 += *a * *b;
+		a += ia;
+		b += ib;
+	}
+	/* The fast pipelined part */
+	n >>= 2;
+	/* If only b has unit stride, exchange the roles of the two vectors so  */
+	/* the "a has unit stride" code below applies.  Both the strides and    */
+	/* the pointers must be exchanged; the pointers are swapped through a   */
+	/* pointer temporary, never through an int, which may be narrower.      */
+	if (ib == 1 && ia != 1) {
+		t = ia;
+		ia = ib;
+		ib = t;
+		tp = a;
+		a = b;
+		b = tp;
+	}
+	/* We can optimize if one or more strides are equal to 1. */
+	if (ia == 1) {
+		/* This runs if both strides are 1. */
+		if (ib == 1) {
+			ia <<= 2;
+			ib <<= 2;
+			while (n--) {
+				sum0 += a[0] * b[0];
+				sum1 += a[1] * b[1];
+				sum2 += a[2] * b[2];
+				sum3 += a[3] * b[3];
+				a += ia;
+				b += ib;
+			}
+		}
+		/* This runs if stride of a only is equal to 1. */
+		else {
+			ia <<= 2;
+			while (n--) {
+				sum0 += a[0] * *b;
+				b += ib;
+				sum1 += a[1] * *b;
+				b += ib;
+				sum2 += a[2] * *b;
+				b += ib;
+				sum3 += a[3] * *b;
+				a += ia;
+				b += ib;
+			}
+		}
+	}
+	/* This runs for the more general case.        */
+	/* This is about .5 MFLOPS slower on Sun 4/370 */
+	else {
+		while (n--) {
+			sum0 += *a * *b;
+			a += ia;
+			b += ib;
+			sum1 += *a * *b;
+			a += ia;
+			b += ib;
+			sum2 += *a * *b;
+			a += ia;
+			b += ib;
+			sum3 += *a * *b;
+			a += ia;
+			b += ib;
+		}
+	}
+	return (sum0 + sum1 + sum2 + sum3);
+}
+#else
+/*****************************************************************************/
+/* Portable double-precision dot product (used when SUN4 is not defined).    */
+/* Accumulates a[k*ia] * b[k*ib] for k = 0 .. n-1 in a single running sum.   */
+/* Unrolling will help pipelined computers; modify accordingly.              */
+/*****************************************************************************/
+double
+Ddot (n, a, ia, b, ib)
+register
+int		n,		/* Number of elements in vectors.  */
+		ia,		/* Stride of a vector in ELEMENTS. */
+		ib;		/* Stride of b vector in ELEMENTS. */
+register
+double	*a,		/* Pointer to first vector.        */
+		*b;		/* Pointer to second vector.       */
+{
+	register double acc = 0.0;
+	/* One product per pass; both cursors advance by their own stride. */
+	for ( ; n != 0 ; n--) {
+		acc += (*a) * (*b);
+		a += ia;
+		b += ib;
+	}
+	return (acc);
+}
+#endif
--- a/benchmarks/slalom.input
+++ b/benchmarks/slalom.input
+40
+20
+0
--- a/benchmarks/whet.c
+++ b/benchmarks/whet.c
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+/*
+timer program -- returns the processor time used by the program so far,
+in seconds, as reported by clock(). Uses constant CLOCK_RATE (clock ticks
+per second) to convert the tick count to seconds.
+*/
+/* CLOCKS_PER_SEC from <time.h> is the tick rate that matches clock() on
+   every hosted implementation (1000000 on POSIX systems), so no
+   per-platform constant such as CLK_TCK is needed. */
+#define CLOCK_RATE ((double)CLOCKS_PER_SEC)
+float second(void);
+float second(void)
+{
+   /* Divide in double and narrow once, so large tick counts are not
+      rounded to float before the division. */
+   return((float)((double)clock() / CLOCK_RATE));
+}
+/* C-style global parameters */
+/* T, T1, T2: float constants assigned once at the start of main and read  */
+/*            by PA, P3 and the loops in main.                             */
+/* E1:        four-element float work array updated by main, PA and P0.    */
+float T,T1,T2,E1[4];
+/* J, K, L: integer work variables; P0 uses J-1, K-1 and L-1 as indices    */
+/*          into E1.                                                       */
+int J,K,L;
+/* Print one result line: a newline, the three counters n, j, k, the four  */
+/* float values x1..x4 in exponent form, and the current value of second().*/
+void POUT(long n, long j, long k, float x1, float x2, float x3, float x4)
+{
+	const float elapsed = second();
+	printf("\n %7.1ld%7.1ld%7.1ld%12.4e%12.4e%12.4e%12.4e%8.2f",
+	       n, j, k, x1, x2, x3, x4, elapsed);
+}
+/* Apply six rounds of the four-element update to E[0..3] in place.        */
+/* Each statement uses the values already rewritten earlier in the same    */
+/* round, so the statement order is significant.  Reads globals T and T2.  */
+void PA(float *E)
+{
+	int round;
+	for (round = 0; round < 6; round++) {
+		E[0]=(E[0]+E[1]+E[2]-E[3])*T;
+		E[1]=(E[0]+E[1]-E[2]+E[3])*T;
+		E[2]=(E[0]-E[1]+E[2]+E[3])*T;
+		E[3]=(-E[0]+E[1]+E[2]+E[3])/T2;
+	}
+}
+/* Shuffle three entries of the global array E1, selected by the global    */
+/* one-based indices J, K and L: E1[J] <- E1[K], E1[K] <- E1[L], and then   */
+/* E1[L] <- the new E1[J].                                                 */
+void P0()
+{
+	const int ji = J - 1;
+	const int ki = K - 1;
+	const int li = L - 1;
+	E1[ji] = E1[ki];
+	E1[ki] = E1[li];
+	E1[li] = E1[ji];
+}
+/* Combine *X and *Y with the global factors T and T2 and store the result */
+/* in *Z.  *X and *Y are only read; the intermediate values live in local  */
+/* floats so every step is rounded to float.                               */
+void P3(float *X, float *Y, float *Z)
+{
+	float x = *X;
+	float y = *Y;
+	x = T * (x + y);
+	y = T * (x + y);	/* uses the x just updated above */
+	*Z = (x + y) / T2;
+}
+/* equivalent description of FORTRAN-style common block ( slow !) */
+/*
+struct _comm_blk_ {
+	float _T, _T1, _T2, _E1[4];
+	int _J,_K,_L;
+} common;
+#define T common._T
+#define T1 common._T1
+#define T2 common._T2
+#define E1 common._E1
+#define J common._J
+#define K common._K
+#define L common._L
+*/
+/* Benchmark driver: runs a fixed sequence of timed loops and prints one    */
+/* POUT line after each, followed by a final "whetstones per second" line. */
+/* The loop counts N1..N12 are all multiples of the scale factor I (10).   */
+/* NOTE(review): there is no explicit return statement at the end of main. */
+int main()
+{
+float X1,X2,X3,X4,X,Y,Z;
+long I,ISAVE,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12;
+	printf("Start timing.");
+	/* Scale factor and global constants. */
+	I = 10;
+	T1=0.50025000;
+	T=0.499975000;
+	T2=2.0000;
+	/* ISAVE is assigned here but not read again in this function. */
+	ISAVE=I;
+	/* Loop counts.  N1 and N10 are zero, so their loops below never run; */
+	/* N5 and N12 are assigned but not used in this function.             */
+	N1=0;
+	N2=12*I;
+	N3=14*I;
+	N4=348*I;
+	N5=0;
+	N6=210*I;
+	N7=32*I;
+	N8=899*I;
+	N9=516*I;
+	N10=0;
+	N11=93*I;
+	N12=0;
+	/* Section 1: float arithmetic on the scalars X1..X4 (N1 passes). */
+	X1=1.0;
+	X2=-1.0;
+	X3=-1.0;
+	X4=-1.;
+	for(I=0; I<N1; I++)
+	{
+		X1=(X1+X2+X3-X4)*T;
+		X2=(X1+X2-X3+X4)*T;
+		X4=(-X1+X2+X3+X4)*T;
+		X3=(X1-X2+X3+X4)*T;
+	}
+	POUT(N1,N1,N1,X1,X2,X3,X4);
+	/* Section 2: the same kind of arithmetic on the global array E1 */
+	/* (N2 passes).                                                  */
+	E1[0]=1.0;
+	E1[1]=-1.0;
+	E1[2]=-1.0;
+	E1[3]=-1.0;
+	for(I=0; I<N2; I++)
+	{
+		E1[0]=(E1[0]+E1[1]+E1[2]-E1[3])*T;
+		E1[1]=(E1[0]+E1[1]-E1[2]+E1[3])*T;
+		E1[2]=(E1[0]-E1[1]+E1[2]+E1[3])*T;
+		E1[3]=(-E1[0]+E1[1]+E1[2]+E1[3])*T;
+	}
+	POUT(N2,N3,N2,E1[0],E1[1],E1[2],E1[3]);
+	/* Section 3: E1 passed as an argument to PA (N3 calls). */
+	for(I=0; I<N3; I++) PA(E1);
+	POUT(N3,N2,N2,E1[0],E1[1],E1[2],E1[3]);
+	/* Section 4: chained conditional assignments to the global J */
+	/* (N4 passes).                                               */
+	J=1;
+	for(I=0; I<N4; I++)
+	{
+		if(J==1) J=2;
+		else J=3;
+		if(J<2) J=0;
+		else J=1;
+		if(J<1) J=1;
+		else J=0;
+	}
+	POUT(N4,J,J,X1,X2,X3,X4);
+	/* Section 5: integer arithmetic on the globals J, K, L, whose results */
+	/* are also used as indices into E1 (N6 passes).                       */
+	J=1;
+	K=2;
+	L=3;
+	for(I=0; I<N6; I++)
+	{
+		J=J*(K-J)*(L-K);
+		K=L*K-(L-J)*K;
+		L=(L-K)*(K+J);
+		E1[L-2]=J+K+L;
+		E1[K-2]=J*K*L;
+	}
+	POUT(N6,(long)J,(long)K,E1[0],E1[1],E1[2],E1[3]);
+	/* Section 6: atan/sin/cos calls (N7 passes).  The loop works on */
+	/* register-qualified local copies of X, Y, T2 and T and copies  */
+	/* the results back to X and Y afterwards.                       */
+	X=0.5;
+	Y=0.5;
+	{
+	 register float x=X;
+	 register float y=Y;
+	 register float t2=T2;
+	 register float t=T;
+	 for(I=0; I<N7; I++)
+	 {
+		x=t*atan(t2*sin(x)*cos(x)/(cos(x+y)+cos(x-y)-1.0));
+		y=t*atan(t2*sin(y)*cos(y)/(cos(x+y)+cos(x-y)-1.0));
+	 }
+	 X=x; Y=y;
+	}
+	POUT(N7,(long)J,(long)K,X,X,Y,Y);
+	/* Section 7: calls to P3 with pointer arguments (N8 calls). */
+	X=1.0;
+	Y=1.0;
+	Z=1.0;
+	for(I=0; I<N8; I++) P3(&X,&Y,&Z);
+	POUT(N8,(long)J,(long)K,X,Y,Z,Z);
+	/* Section 8: calls to P0, which shuffles E1 through the global */
+	/* indices J, K, L (N9 calls).  E1[3] is not reset here.        */
+	J=1;
+	K=2;
+	L=3;
+	E1[0]=1.0;
+	E1[1]=2.0;
+	E1[2]=3.0;
+	for(I=0; I<N9; I++) P0();
+	POUT(N9,(long)J,(long)K,E1[0],E1[1],E1[2],E1[3]);
+	/* Section 9: integer add/subtract on J and K (N10 passes). */
+	J=2;
+	K=3;
+	for(I=0; I<N10; I++)
+	{
+		J+=K;
+		K+=J;
+		J-=K;
+		K-=J+J;
+	}
+	POUT(N10,(long)J,(long)K,X1,X2,X3,X4);
+	/* Section 10: sqrt/exp/log calls on a register-qualified local copy */
+	/* of X (N11 passes).                                                */
+	X=0.75;
+	{
+	 register float x=X;
+	 register float t1=T1;
+	 for(I=0; I<N11; I++) 	x=sqrt(exp(log(x)/t1));
+	 X=x;
+	}
+	POUT(N11,(long)J,(long)K,X,X,X,X);
+	/* NOTE(review): divides by second(); if the reported time is still */
+	/* zero at this point the quotient is not finite.                   */
+	printf("\n %g whetstones per second\n", 1.0e+08/second());
+}