Commit 38c72a01 authored by Jayke Meijer's avatar Jayke Meijer

Added benchmark C files.

parent ab223369
# Build rules for the benchmark programs.
#   make / make all : build every benchmark executable
#   make asm        : only generate the assembly listings
#   make clean      : remove executables and generated files
CC=xgcc
CFLAGS=-O0

# These targets name actions, not files.
.PHONY: all asm clean

# C source -> assembly listing.
%.s: %.c
	$(CC) $(CFLAGS) -S $<

# Assembly listing -> object file.
%.o: %.s
	$(CC) $(CFLAGS) -c $<

# Object file -> executable.  The output name must be given explicitly;
# without -o the compiler driver writes every program to a.out.
%: %.o
	$(CC) $(CFLAGS) -o $@ $<

all: acron clinpack dhrystone pi slalom whet

asm: acron.s clinpack.s dhrystone.s pi.s slalom.s whet.s

clean:
	rm -f acron clinpack dhrystone pi slalom whet
	rm -f *.o
	rm -f *.s
	rm -f *.cycles *.output
#include <ctype.h>
#include <stdio.h>
/* Number of words in the dictionary w[]. */
#define N 6
/* Candidate words; do_perm() copies the leading upper-case letters of each
   selected word into the acronym. */
char *w[] = {"Vertalerbouw", "Ertalerbouw", "Practicum", "Optimization", "Peephole", "Eephole"};
/* acron: buffer for the acronym being printed.
   command: not referenced by the code visible in this file. */
char acron[N*2], command[100];
/* done[j] != 0 marks word j as already used in the current sequence;
   pindex[] holds the indices into w[] of the words chosen so far. */
int done[N], pindex[N+1];
/*
 * Return 1 when c is one of the character codes 65, 69, 73, 79, 85 or 89
 * (the upper-case letters A, E, I, O, U and Y in ASCII), otherwise 0.
 */
int is_vowel(char c)
{
    switch (c) {
    case 65:    /* A */
    case 69:    /* E */
    case 73:    /* I */
    case 79:    /* O */
    case 85:    /* U */
    case 89:    /* Y */
        return 1;
    default:
        return 0;
    }
}
/*
 * Extend the current word sequence with word n and recurse.
 *
 *   n      index into w[] of the word to store at position `index`
 *   done   per-word "already used" flags; set to 1 around each recursive
 *          call and cleared again afterwards
 *   index  number of words already stored in pindex[]
 *   size   number of words still to be placed, including word n
 *
 * A sequence is abandoned when its first two words both start with a
 * non-vowel, or when three consecutive words start with all vowels or all
 * non-vowels.  Once the sequence is complete, the acronym (the leading
 * upper-case letters of every word) is printed, followed by the words.
 */
void do_perm(int n, int done[], int index, int size)
{
    int j, i, nrv = 0, k;

    /* The first pair needs at least one vowel-initial word. */
    if (index == 1 && (!is_vowel(w[pindex[0]][0]) && !is_vowel(w[n][0])))
        return;

    /* Count vowel-initial words among the previous two and the new one. */
    if (index > 1) {
        nrv = is_vowel(w[pindex[index-2]][0]) +
              is_vowel(w[pindex[index-1]][0]) +
              is_vowel(w[n][0]);
        if (nrv == 0 || nrv == 3)
            return;
    }

    pindex[index++] = n;

    if (index < N && --size) {
        /* More words wanted: try every unused word in the next slot. */
        for (j = 0; j < N; j++) {
            if (done[j] == 0) {
                done[j] = 1;
                do_perm(j, done, index, size);
                done[j] = 0;
            }
        }
    } else {
        /* Sequence complete: build and print the acronym. */
        k = 0;
        for (i = 0; i < index; i++) {
            const char *word = w[pindex[i]];
            int t = 0;
            /* The cast keeps the <ctype.h> call defined for any char value;
               the bound keeps room for the terminating NUL in acron[]. */
            while (k < (int)sizeof acron - 1 &&
                   isupper((unsigned char)word[t]))
                acron[k++] = word[t++];
        }
        acron[k] = 0;
        printf("%s", acron);
        for (i = 0; i < index; i++)
            printf(" %s", w[pindex[i]]);
        printf("\n");
        /* fflush(stdout); */
    }
}
/*
 * Enumerate word sequences of every length from 4 up to N, trying each
 * word in turn as the first word of the sequence.
 */
int main()
{
    int length, first;

    length = 4;
    while (length <= N) {
        first = 0;
        while (first < N) {
            done[first] = 1;
            do_perm(first, done, 0, length);
            done[first] = 0;
            first++;
        }
        length++;
    }
}
/*
Translated to C by Bonnie Toy 5/88
You MUST specify one of -DSP or -DDP to compile correctly.
You MUST specify one of -DROLL or -DUNROLL to compile correctly.
You MUST specify a timer option(see below) to compile correctly.
To compile double precision version for Sun-4:
cc -DUNIX -DDP -DROLL -O4 clinpack.c
To compile single precision version for Sun-4:
cc -DUNIX -DSP -DROLL -O4 -fsingle -fsingle2 clinpack.c
To obtain rolled source BLAS, add -DROLL to the command lines.
To obtain unrolled source BLAS, add -DUNROLL to the command lines.
PLEASE NOTE: You can also just 'uncomment' one of the options below.
*/
/* #define SP */
#define DP
/*#define ROLL */
#define UNROLL
/***************************************************************/
/* Timer options. You MUST uncomment one of the options below */
/* or compile, for example, with the '-DUNIX' option. */
/***************************************************************/
/* #define Amiga */
#define UNIX
/* #define UNIX_Old */
/* #define VMS */
/* #define BORLAND_C */
/* #define MSC */
/* #define MAC */
/* #define IPSC */
/* #define FORTRAN_SEC */
/* #define GTODay */
/* #define CTimer */
/* #define UXPM */
#include <stdio.h>
#include <math.h>
/* Single precision build: REAL is float. */
#ifdef SP
#define REAL float
#define ZERO 0.0
#define ONE 1.0
#define PREC "Single "
#endif
/* Double precision build: REAL is double. */
#ifdef DP
#define REAL double
#define ZERO 0.0e0
#define ONE 1.0e0
#define PREC "Double "
#endif
/* Number of repetitions used by the averaged timing runs in main(). */
#define NTIMES 1
/* ROLLING is the label printed for the BLAS loop variant selected by
   ROLL / UNROLL. */
#ifdef ROLL
#define ROLLING "Rolled "
#endif
#ifdef UNROLL
#define ROLLING "Unrolled "
#endif
/* Timing table, indexed st[statistic][run]: six statistics (dgefa time,
   dgesl time, total, kflops, unit, ratio) for each of eight runs.  The
   dimensions must be [6][8] so that the run index 0..7 stays inside the
   inner array. */
static double st[6][8];
/*
 * Linpack benchmark driver.  Generates an n x n system with matgen(),
 * factors it with dgefa() and solves it with dgesl(), checks the residual
 * of the first solve, and then repeats the factor/solve timing for two
 * storage layouts: array a with leading dimension lda (runs 0-3) and
 * array aa with leading dimension ldaa (runs 4-7).
 *
 * Results are kept in st[stat][run], where stat is
 *   0 = dgefa time, 1 = dgesl time, 2 = total time,
 *   3 = ops/(1.0e3*total)  (printed under "kflops"),
 *   4 = 2.0e3/st[3][run]   (printed under "unit"),
 *   5 = total/cray         (printed under "ratio").
 */
main ()
{
static REAL aa[200][200],a[200][201],b[200],x[200];
REAL cray,ops,total,norma,normx;
REAL resid,residn,eps;
REAL epslon(),kf;
double t1,tm,tm2,dtime();
static int ipvt[200],n,i,ntimes,info,lda,ldaa,kflops;
/* Leading dimensions of a and aa, and the order n of the system solved. */
lda = 201;
ldaa = 200;
cray = .056;
n = 25;
printf(ROLLING); printf(PREC);
printf("Precision Linpack\n\n");
/* Operation count used to convert elapsed time into kflops. */
ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
/* Run 0: factor and solve once, timing each phase separately. */
/* NOTE(review): if dtime() does not advance between two samples, total is
   0.0 and the divisions by total / st[3][..] below yield inf or NaN --
   confirm the timer resolution is adequate for this n. */
matgen(a,lda,n,b,&norma);
t1 = dtime();
dgefa(a,lda,n,ipvt,&info);
st[0][0] = dtime() - t1;
t1 = dtime();
dgesl(a,lda,n,ipvt,b,0);
st[1][0] = dtime() - t1;
total = st[0][0] + st[1][0];
/* compute a residual to verify results. */
/* Save the solution in x, regenerate a and b, then form b = a*x - b. */
for (i = 0; i < n; i++)
{
x[i] = b[i];
}
matgen(a,lda,n,b,&norma);
for (i = 0; i < n; i++)
{
b[i] = -b[i];
}
dmxpy(n,b,n,lda,x,a);
/* resid and normx are the largest absolute entries of b and x. */
resid = 0.0;
normx = 0.0;
for (i = 0; i < n; i++)
{
resid = (resid > fabs((double)b[i]))
? resid : fabs((double)b[i]);
normx = (normx > fabs((double)x[i]))
? normx : fabs((double)x[i]);
}
eps = epslon((REAL)ONE);
residn = resid/( n*norma*normx*eps );
printf(" norm. resid resid machep");
printf(" x[0]-1 x[n-1]-1\n");
printf("%8.1f %16.8e%16.8e%16.8e%16.8e\n",
(double)residn, (double)resid, (double)eps,
(double)x[0]-1, (double)x[n-1]-1);
printf(" times are reported for matrices of order %5d\n",n);
printf(" dgefa dgesl total kflops unit");
printf(" ratio\n");
/* Derived statistics for run 0. */
st[2][0] = total;
st[3][0] = ops/(1.0e3*total);
st[4][0] = 2.0e3/st[3][0];
st[5][0] = total/cray;
printf(" times for array with leading dimension of%5d\n",lda);
print_time(0);
/* Run 1: same measurement repeated on array a. */
matgen(a,lda,n,b,&norma);
t1 = dtime();
dgefa(a,lda,n,ipvt,&info);
st[0][1] = dtime() - t1;
t1 = dtime();
dgesl(a,lda,n,ipvt,b,0);
st[1][1] = dtime() - t1;
total = st[0][1] + st[1][1];
st[2][1] = total;
st[3][1] = ops/(1.0e3*total);
st[4][1] = 2.0e3/st[3][1];
st[5][1] = total/cray;
/* Run 2: same measurement repeated on array a. */
matgen(a,lda,n,b,&norma);
t1 = dtime();
dgefa(a,lda,n,ipvt,&info);
st[0][2] = dtime() - t1;
t1 = dtime();
dgesl(a,lda,n,ipvt,b,0);
st[1][2] = dtime() - t1;
total = st[0][2] + st[1][2];
st[2][2] = total;
st[3][2] = ops/(1.0e3*total);
st[4][2] = 2.0e3/st[3][2];
st[5][2] = total/cray;
/* Run 3: average over ntimes repetitions.  tm2 accumulates the time spent
   in matgen() so it can be subtracted from the dgefa measurement. */
ntimes = NTIMES;
tm2 = 0.0;
t1 = dtime();
for (i = 0; i < ntimes; i++) {
tm = dtime();
matgen(a,lda,n,b,&norma);
tm2 = tm2 + dtime() - tm;
dgefa(a,lda,n,ipvt,&info);
}
st[0][3] = (dtime() - t1 - tm2)/ntimes;
t1 = dtime();
for (i = 0; i < ntimes; i++) {
dgesl(a,lda,n,ipvt,b,0);
}
st[1][3] = (dtime() - t1)/ntimes;
total = st[0][3] + st[1][3];
st[2][3] = total;
st[3][3] = ops/(1.0e3*total);
st[4][3] = 2.0e3/st[3][3];
st[5][3] = total/cray;
print_time(1);
print_time(2);
print_time(3);
/* Runs 4-6: the same single-shot measurement on array aa (leading
   dimension ldaa). */
matgen(aa,ldaa,n,b,&norma);
t1 = dtime();
dgefa(aa,ldaa,n,ipvt,&info);
st[0][4] = dtime() - t1;
t1 = dtime();
dgesl(aa,ldaa,n,ipvt,b,0);
st[1][4] = dtime() - t1;
total = st[0][4] + st[1][4];
st[2][4] = total;
st[3][4] = ops/(1.0e3*total);
st[4][4] = 2.0e3/st[3][4];
st[5][4] = total/cray;
matgen(aa,ldaa,n,b,&norma);
t1 = dtime();
dgefa(aa,ldaa,n,ipvt,&info);
st[0][5] = dtime() - t1;
t1 = dtime();
dgesl(aa,ldaa,n,ipvt,b,0);
st[1][5] = dtime() - t1;
total = st[0][5] + st[1][5];
st[2][5] = total;
st[3][5] = ops/(1.0e3*total);
st[4][5] = 2.0e3/st[3][5];
st[5][5] = total/cray;
matgen(aa,ldaa,n,b,&norma);
t1 = dtime();
dgefa(aa,ldaa,n,ipvt,&info);
st[0][6] = dtime() - t1;
t1 = dtime();
dgesl(aa,ldaa,n,ipvt,b,0);
st[1][6] = dtime() - t1;
total = st[0][6] + st[1][6];
st[2][6] = total;
st[3][6] = ops/(1.0e3*total);
st[4][6] = 2.0e3/st[3][6];
st[5][6] = total/cray;
/* Run 7: averaged measurement on array aa, matgen() time excluded. */
ntimes = NTIMES;
tm2 = 0;
t1 = dtime();
for (i = 0; i < ntimes; i++) {
tm = dtime();
matgen(aa,ldaa,n,b,&norma);
tm2 = tm2 + dtime() - tm;
dgefa(aa,ldaa,n,ipvt,&info);
}
st[0][7] = (dtime() - t1 - tm2)/ntimes;
t1 = dtime();
for (i = 0; i < ntimes; i++) {
dgesl(aa,ldaa,n,ipvt,b,0);
}
st[1][7] = (dtime() - t1)/ntimes;
total = st[0][7] + st[1][7];
st[2][7] = total;
st[3][7] = ops/(1.0e3*total);
st[4][7] = 2.0e3/st[3][7];
st[5][7] = total/cray;
/* the following code sequence implements the semantics of
the Fortran intrinsics "nint(min(st[3][3],st[3][7]))" */
/*
kf = (st[3][3] < st[3][7]) ? st[3][3] : st[3][7];
kf = (kf > ZERO) ? (kf + .5) : (kf - .5);
if (fabs((double)kf) < ONE)
kflops = 0;
else {
kflops = floor(fabs((double)kf));
if (kf < ZERO) kflops = -kflops;
}
*/
/* Clamp negative rates to zero, take the smaller of the two averaged
   rates (runs 3 and 7) and round it to the nearest integer. */
if ( st[3][3] < ZERO ) st[3][3] = ZERO;
if ( st[3][7] < ZERO ) st[3][7] = ZERO;
kf = st[3][3];
if ( st[3][7] < st[3][3] ) kf = st[3][7];
kflops = (int)(kf + 0.5);
printf(" times for array with leading dimension of%4d\n",ldaa);
print_time(4);
print_time(5);
print_time(6);
print_time(7);
printf(ROLLING); printf(PREC);
printf(" Precision %5d Kflops ; %d Reps \n",kflops,NTIMES);
}
/*----------------------*/
/*
 * Print one line of the timing table: the six statistics stored in
 * st[0..5][row].  Note that `row` selects the second index of st.
 */
print_time (row)
int row;
{
    double t_dgefa = (double)st[0][row];
    double t_dgesl = (double)st[1][row];
    double t_total = (double)st[2][row];
    double rate    = (double)st[3][row];
    double unit    = (double)st[4][row];
    double ratio   = (double)st[5][row];

    printf("%11.2f%11.2f%11.2f%11.0f%11.2f%11.2f\n",
           t_dgefa, t_dgesl, t_total, rate, unit, ratio);
}
/*----------------------*/
/*
 * Fill the n x n matrix a with pseudo-random values and set b to the sum
 * of the n generated vectors, so that a solve is expected to return a
 * vector of ones.
 *
 *   a      matrix storage; element (j,i) lives at a[lda*j+i]
 *   lda    leading dimension of a
 *   n      order of the matrix
 *   b      right-hand side, length n (overwritten)
 *   norma  receives the largest generated entry (never less than 0.0)
 *
 * The generator is restarted from the same seed on every call, so repeated
 * calls produce the same matrix.
 */
matgen(a,lda,n,b,norma)
REAL a[],b[],*norma;
int lda, n;
{
    int seed, vec, elem;
    REAL *v;

    seed = 1325;
    *norma = 0.0;
    for (vec = 0; vec < n; vec++) {
        v = &a[lda*vec];
        for (elem = 0; elem < n; elem++) {
            seed = 3125*seed % 65536;
            v[elem] = (seed - 32768.0)/16384.0;
            if (v[elem] > *norma)
                *norma = v[elem];
        }
    }

    for (elem = 0; elem < n; elem++)
        b[elem] = 0.0;

    for (vec = 0; vec < n; vec++) {
        v = &a[lda*vec];
        for (elem = 0; elem < n; elem++)
            b[elem] = b[elem] + v[elem];
    }
}
/*----------------------*/
/*
 * dgefa: factor a matrix by gaussian elimination with partial pivoting
 * (LINPACK, version dated 08/14/78, Cleve Moler).
 *
 * The two-dimensional matrix is passed as a flat array; element [i][j] is
 * addressed as a[lda*i+j].
 *
 * on entry
 *   a     the matrix to be factored
 *   lda   leading dimension of the array a
 *   n     order of the matrix
 *
 * on return
 *   a     an upper triangular matrix and the multipliers used to obtain
 *         it; the factorization can be written a = l*u
 *   ipvt  pivot indices, n entries
 *   info  0 normally, or k if a zero pivot was met in step k.  A zero
 *         pivot is not an error here, but dgesl will divide by zero if it
 *         is called with such a factorization.
 *
 * Uses the BLAS routines daxpy, dscal and idamax.
 */
dgefa(a,lda,n,ipvt,info)
REAL a[];
int lda,n,ipvt[],*info;
{
    REAL mult, *pivcol, *col;
    int idamax(), c, k, piv, last;

    *info = 0;
    last = n - 1;

    for (k = 0; k < last; k++) {
        pivcol = &a[lda*k];

        /* locate the pivot within the current vector */
        piv = idamax(n-k,&pivcol[k],1) + k;
        ipvt[k] = piv;

        /* a zero pivot means this step is already triangularized */
        if (pivcol[piv] == ZERO) {
            *info = k;
            continue;
        }

        /* bring the pivot onto the diagonal */
        if (piv != k) {
            mult = pivcol[piv];
            pivcol[piv] = pivcol[k];
            pivcol[k] = mult;
        }

        /* compute the multipliers */
        mult = -ONE/pivcol[k];
        dscal(n-(k+1),mult,&pivcol[k+1],1);

        /* eliminate in each remaining vector, applying the same swap */
        for (c = k + 1; c < n; c++) {
            col = &a[lda*c];
            mult = col[piv];
            if (piv != k) {
                col[piv] = col[k];
                col[k] = mult;
            }
            daxpy(n-(k+1),mult,&pivcol[k+1],1,&col[k+1],1);
        }
    }

    ipvt[last] = last;
    if (a[lda*last+last] == ZERO) *info = last;
}
/*----------------------*/
/*
 * dgesl: solve the system  a * x = b  or  trans(a) * x = b  using the
 * factors computed by dgeco or dgefa (LINPACK, version dated 08/14/78,
 * Cleve Moler).
 *
 * The two-dimensional matrix is passed as a flat array; element [i][j] is
 * addressed as a[lda*i+j].
 *
 * on entry
 *   a     the output from dgeco or dgefa
 *   lda   leading dimension of the array a
 *   n     order of the matrix
 *   ipvt  the pivot vector from dgeco or dgefa
 *   b     the right hand side vector, length n
 *   job   0 to solve a*x = b, nonzero to solve trans(a)*x = b
 *
 * on return
 *   b     the solution vector x
 *
 * error condition
 *   A division by zero occurs if the input factor contains a zero on the
 *   diagonal.  It will not occur if dgefa has set info to 0.
 *
 * Uses the BLAS routines daxpy and ddot.
 */
dgesl(a,lda,n,ipvt,b,job)
int lda,n,ipvt[],job;
REAL a[],b[];
{
    /* internal variables */
    REAL ddot(),t;
    int k,kb,l,nm1;

    nm1 = n - 1;
    if (job == 0) {
        /* job = 0 , solve a * x = b
           first solve l*y = b */
        if (nm1 >= 1) {
            for (k = 0; k < nm1; k++) {
                l = ipvt[k];
                t = b[l];
                if (l != k){
                    b[l] = b[k];
                    b[k] = t;
                }
                daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1);
            }
        }
        /* now solve u*x = y */
        for (kb = 0; kb < n; kb++) {
            k = n - (kb + 1);
            b[k] = b[k]/a[lda*k+k];
            t = -b[k];
            daxpy(k,t,&a[lda*k+0],1,&b[0],1);
        }
    }
    else {
        /* job = nonzero, solve trans(a) * x = b
           first solve trans(u)*y = b */
        for (k = 0; k < n; k++) {
            t = ddot(k,&a[lda*k+0],1,&b[0],1);
            b[k] = (b[k] - t)/a[lda*k+k];
        }
        /* now solve trans(l)*x = y */
        if (nm1 >= 1) {
            /* k has to run from n-2 down to 0 inclusive, mirroring the
               forward loop of the job == 0 branch; the bound is therefore
               kb <= nm1 (the Fortran original iterates kb = 1..nm1 with
               the 1-based index k = n - kb). */
            for (kb = 1; kb <= nm1; kb++) {
                k = n - (kb+1);
                b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
                l = ipvt[k];
                if (l != k) {
                    t = b[l];
                    b[l] = b[k];
                    b[k] = t;
                }
            }
        }
    }
}
/*----------------------*/
/*
 * daxpy: constant times a vector plus a vector, dy = dy + da*dx
 * (Jack Dongarra, LINPACK, 3/11/78).
 *
 *   n     number of elements to process; nothing is done when n <= 0
 *   da    the scalar multiplier; nothing is done when it is zero
 *   dx    source vector, stepped by incx
 *   dy    destination vector, stepped by incy (updated in place)
 *
 * With a negative increment the corresponding vector is traversed from
 * its far end, so all indices stay in 0 .. (n-1)*|inc|.
 */
daxpy(n,da,dx,incx,dy,incy)
REAL dx[],dy[],da;
int incx,incy,n;
{
    int i,ix,iy,m;

    if(n <= 0) return;
    if (da == ZERO) return;

    if(incx != 1 || incy != 1) {
        /* code for unequal increments or equal increments
           not equal to 1 */
        /* C arrays are 0-based: start at element 0 (not 1 as in the
           Fortran original), exactly as ddot does. */
        ix = 0;
        iy = 0;
        if(incx < 0) ix = (-n+1)*incx;
        if(incy < 0) iy = (-n+1)*incy;
        for (i = 0;i < n; i++) {
            dy[iy] = dy[iy] + da*dx[ix];
            ix = ix + incx;
            iy = iy + incy;
        }
        return;
    }

    /* code for both increments equal to 1 */
#ifdef ROLL
    for (i = 0;i < n; i++) {
        dy[i] = dy[i] + da*dx[i];
    }
#endif
#ifdef UNROLL
    /* handle the n % 4 leftover elements first, then groups of four */
    m = n % 4;
    if ( m != 0) {
        for (i = 0; i < m; i++)
            dy[i] = dy[i] + da*dx[i];
        if (n < 4) return;
    }
    for (i = m; i < n; i = i + 4) {
        dy[i] = dy[i] + da*dx[i];
        dy[i+1] = dy[i+1] + da*dx[i+1];
        dy[i+2] = dy[i+2] + da*dx[i+2];
        dy[i+3] = dy[i+3] + da*dx[i+3];
    }
#endif
}
/*----------------------*/
/*
 * ddot: dot product of two vectors (Jack Dongarra, LINPACK, 3/11/78).
 *
 *   n     number of element pairs; ZERO is returned when n <= 0
 *   dx    first vector, stepped by incx
 *   dy    second vector, stepped by incy
 *
 * Returns the accumulated sum of dx[..]*dy[..].  With a negative increment
 * the corresponding vector is traversed starting from index (-n+1)*inc.
 */
REAL ddot(n,dx,incx,dy,incy)
REAL dx[],dy[];
int incx,incy,n;
{
    REAL sum;
    int k, px, py, head;

    sum = ZERO;
    if (n <= 0)
        return(ZERO);

    if (incx != 1 || incy != 1) {
        /* general strides */
        px = (incx < 0) ? (-n+1)*incx : 0;
        py = (incy < 0) ? (-n+1)*incy : 0;
        for (k = 0; k < n; k++) {
            sum = sum + dx[px]*dy[py];
            px = px + incx;
            py = py + incy;
        }
        return(sum);
    }

    /* both strides are 1 */
#ifdef ROLL
    for (k = 0; k < n; k++)
        sum = sum + dx[k]*dy[k];
    return(sum);
#endif
#ifdef UNROLL
    /* the n % 5 leftover elements first, then groups of five */
    head = n % 5;
    for (k = 0; k < head; k++)
        sum = sum + dx[k]*dy[k];
    for (k = head; k < n; k = k + 5) {
        sum = sum + dx[k]*dy[k] +
              dx[k+1]*dy[k+1] + dx[k+2]*dy[k+2] +
              dx[k+3]*dy[k+3] + dx[k+4]*dy[k+4];
    }
    return(sum);
#endif
}
/*----------------------*/
/*
 * dscal: scale a vector by a constant, dx = da*dx
 * (Jack Dongarra, LINPACK, 3/11/78).
 *
 *   n     number of elements to scale; nothing is done when n <= 0
 *   da    the scale factor
 *   dx    the vector, updated in place
 *   incx  distance between consecutive elements of dx
 */
dscal(n,da,dx,incx)
REAL da,dx[];
int n, incx;
{
    int k, head, limit;

    if (n <= 0)
        return;

    if (incx != 1) {
        /* general stride */
        limit = n*incx;
        k = 0;
        while (k < limit) {
            dx[k] = da*dx[k];
            k = k + incx;
        }
        return;
    }

    /* stride of 1 */
#ifdef ROLL
    for (k = 0; k < n; k++)
        dx[k] = da*dx[k];
#endif
#ifdef UNROLL
    /* the n % 5 leftover elements first, then groups of five */
    head = n % 5;
    for (k = 0; k < head; k++)
        dx[k] = da*dx[k];
    for (k = head; k < n; k = k + 5) {
        dx[k] = da*dx[k];
        dx[k+1] = da*dx[k+1];
        dx[k+2] = da*dx[k+2];
        dx[k+3] = da*dx[k+3];
        dx[k+4] = da*dx[k+4];
    }
#endif
}
/*----------------------*/
/*
 * idamax: find the index of the element having the largest absolute value
 * (Jack Dongarra, LINPACK, 3/11/78).
 *
 *   n     number of elements to examine
 *   dx    the vector
 *   incx  distance between consecutive elements of dx
 *
 * Returns the 0-based position (counted in elements, not in array slots)
 * of the first element with maximal absolute value, or -1 when n < 1 or
 * incx is not positive.
 */
int idamax(n,dx,incx)
REAL dx[];
int incx,n;
{
    REAL dmax;
    int i, ix, itemp;

    if( n < 1 || incx <= 0 ) return(-1);
    if(n ==1 ) return(0);

    /* Element 0 is the initial candidate in both code paths, so itemp is
       always defined even when no later element is larger. */
    itemp = 0;
    dmax = fabs((double)dx[0]);

    if(incx != 1) {
        /* code for increment not equal to 1; 0-based, so the second
           element sits at index incx */
        ix = incx;
        for (i = 1; i < n; i++) {
            if(fabs((double)dx[ix]) > dmax) {
                itemp = i;
                dmax = fabs((double)dx[ix]);
            }
            ix = ix + incx;
        }
    }
    else {
        /* code for increment equal to 1 */
        for (i = 1; i < n; i++) {
            if(fabs((double)dx[i]) > dmax) {
                itemp = i;
                dmax = fabs((double)dx[i]);
            }
        }
    }
    return (itemp);
}
/*----------------------*/
/*
 * epslon: estimate the unit roundoff in quantities of size x
 * (EISPACK auxiliary routine, version dated 4/6/83).
 *
 * The estimate relies on two assumptions:
 *   1. the base of the floating point representation is not a power of
 *      three;
 *   2. the value 4/3 is held to the accuracy of floating point variables
 *      stored in memory.
 * Under these assumptions 4/3 is not exact, 3*(4/3 - 1) is not exactly
 * one, and its distance from one measures the separation of 1.0 from the
 * next larger floating point number.  The computation is repeated for as
 * long as that distance comes out as zero.
 */
REAL epslon (x)
REAL x;
{
    REAL four_thirds, third, near_one, eps;

    four_thirds = 4.0e0/3.0e0;
    do {
        third = four_thirds - ONE;
        near_one = third + third + third;
        eps = fabs((double)(near_one-ONE));
    } while (eps == ZERO);

    return(eps*fabs((double)x));
}
/*----------------------*/
/*
 * dmxpy: y = y + m*x, accumulated over the n2 vectors of m in groups of
 * 1, 2, 4, 8 and then 16 so that the main loop is fully unrolled.  The
 * order of the additions inside each expression is fixed by the
 * parentheses.
 */
dmxpy (n1, y, n2, ldm, x, m)
REAL y[], x[], m[];
int n1, n2, ldm;
/* We would like to declare m[][ldm], but c does not allow it. In this
function, references to m[i][j] are written m[ldm*i+j]. */
/*
purpose:
multiply matrix m times vector x and add the result to vector y.
parameters:
n1 integer, number of elements in vector y, and number of rows in
matrix m
y double [n1], vector of length n1 to which is added
the product m*x
n2 integer, number of elements in vector x, and number of columns
in matrix m
ldm integer, leading dimension of array m
x double [n2], vector of length n2
m double [ldm][n2], matrix of n1 rows and n2 columns
----------------------------------------------------------------------
*/
{
int j,i,jmin;
/* cleanup odd vector */
/* if n2 is odd, vector 0 is handled on its own */
j = n2 % 2;
if (j >= 1) {
j = j - 1;
for (i = 0; i < n1; i++)
y[i] = (y[i]) + x[j]*m[ldm*j+i];
}
/* cleanup odd group of two vectors */
/* j becomes the index of the last vector of the pair */
j = n2 % 4;
if (j >= 2) {
j = j - 1;
for (i = 0; i < n1; i++)
y[i] = ( (y[i])
+ x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
}
/* cleanup odd group of four vectors */
j = n2 % 8;
if (j >= 4) {
j = j - 1;
for (i = 0; i < n1; i++)
y[i] = ((( (y[i])
+ x[j-3]*m[ldm*(j-3)+i])
+ x[j-2]*m[ldm*(j-2)+i])
+ x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
}
/* cleanup odd group of eight vectors */
j = n2 % 16;
if (j >= 8) {
j = j - 1;
for (i = 0; i < n1; i++)
y[i] = ((((((( (y[i])
+ x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
+ x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
+ x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
+ x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i];
}
/* main loop - groups of sixteen vectors */
/* the first n2 % 16 vectors were consumed by the cleanup steps above;
j is the index of the last vector of each group of sixteen */
jmin = (n2%16)+16;
for (j = jmin-1; j < n2; j = j + 16) {
for (i = 0; i < n1; i++)
y[i] = ((((((((((((((( (y[i])
+ x[j-15]*m[ldm*(j-15)+i])
+ x[j-14]*m[ldm*(j-14)+i])
+ x[j-13]*m[ldm*(j-13)+i])
+ x[j-12]*m[ldm*(j-12)+i])
+ x[j-11]*m[ldm*(j-11)+i])
+ x[j-10]*m[ldm*(j-10)+i])
+ x[j- 9]*m[ldm*(j- 9)+i])
+ x[j- 8]*m[ldm*(j- 8)+i])
+ x[j- 7]*m[ldm*(j- 7)+i])
+ x[j- 6]*m[ldm*(j- 6)+i])
+ x[j- 5]*m[ldm*(j- 5)+i])
+ x[j- 4]*m[ldm*(j- 4)+i])
+ x[j- 3]*m[ldm*(j- 3)+i])
+ x[j- 2]*m[ldm*(j- 2)+i])
+ x[j- 1]*m[ldm*(j- 1)+i])
+ x[j] *m[ldm*j+i];
}
}
/*****************************************************/
/* Various timer routines. */
/* Al Aburto, aburto@marlin.nosc.mil, 26 Sep 1992 */
/* */
/* t = dtime() outputs the current time in seconds. */
/* Use CAUTION as some of these routines will mess */
/* up when timing across the hour mark!!! */
/* */
/* For timing I use the 'user' time whenever */
/* possible. Using 'user+sys' time is a separate */
/* issue. */
/* */
/*****************************************************/
/*********************************/
/* Timer code. */
/*********************************/
/*******************/
/* Amiga dtime() */
/*******************/
#ifdef Amiga
#include <ctype.h>
#define HZ 50
/* Amiga timer: seconds computed from the minutes and ticks fields filled
   in by DateStamp(); the days field is not used. */
double dtime()
{
    struct tt {
        long days;
        long minutes;
        long ticks;
    } tt;
    long ticks_total;

    DateStamp(&tt);
    ticks_total = tt.ticks + (tt.minutes * 60L * 50L);
    return ((double)ticks_total) / (double)HZ;
}
#endif
/*****************************************************/
/* UNIX dtime(). This is the preferred UNIX timer. */
/* Provided by: Markku Kolkka, mk59200@cc.tut.fi */
/* HP-UX Addition by: Bo Thide', bt@irfu.se */
/*****************************************************/
#ifdef UNIX
#include <sys/time.h>
#include <sys/resource.h>
#ifdef __hpux
#include <sys/syscall.h>
#define getrusage(a,b) syscall(SYS_getrusage,a,b)
#endif
/* Buffer refilled by getrusage() on every dtime() call. */
struct rusage rusage;

/* UNIX timer: user CPU time of this process (ru_utime), in seconds. */
double dtime()
{
    getrusage(RUSAGE_SELF,&rusage);
    return (double)(rusage.ru_utime.tv_sec)
         + (double)(rusage.ru_utime.tv_usec) * 1.0e-06;
}
#endif
/***************************************************/
/* UNIX_Old dtime(). This is the old UNIX timer. */
/* Use only if absolutely necessary as HZ may be */
/* ill defined on your system. */
/***************************************************/
#ifdef UNIX_Old
#include <sys/types.h>
#include <sys/times.h>
#include <sys/param.h>
/* Fallback tick rate when the system headers do not provide HZ. */
#ifndef HZ
#define HZ 60
#endif
/* Buffer refilled by times() on every dtime() call. */
struct tms tms;

/* Old UNIX timer: user time from times(), converted to seconds via HZ. */
double dtime()
{
    times(&tms);
    return (double)(tms.tms_utime) / (double)HZ;
}
#endif
/*********************************************************/
/* VMS dtime() for VMS systems. */
/* Provided by: RAMO@uvphys.phys.UVic.CA */
/* Some people have run into problems with this timer. */
/*********************************************************/
#ifdef VMS
#include time
/* Fallback tick rate when HZ is not already defined. */
#ifndef HZ
#define HZ 100
#endif
/* Layout of the buffer passed to times() in this variant. */
struct tbuffer_t
{
    int proc_user_time;
    int proc_system_time;
    int child_user_time;
    int child_system_time;
};
struct tbuffer_t tms;

/* VMS timer: process user time from times(), converted to seconds via HZ. */
double dtime()
{
    times(&tms);
    return (double)(tms.proc_user_time) / (double)HZ;
}
#endif
/******************************/
/* BORLAND C dtime() for DOS */
/******************************/
#ifdef BORLAND_C
#include <ctype.h>
#include <dos.h>
#include <time.h>
/* ti_hund is divided by HZ to convert it to seconds. */
#define HZ 100
/* Buffer refilled by gettime() on every dtime() call. */
struct time tnow;

/* Borland C timer: minutes, seconds and hundredths from gettime(),
   combined into seconds.  The hour is not included. */
double dtime()
{
    gettime(&tnow);
    return 60.0 * (double)(tnow.ti_min)
         + (double)(tnow.ti_sec)
         + (double)(tnow.ti_hund)/(double)HZ;
}
#endif
/**************************************/
/* Microsoft C (MSC) dtime() for DOS */
/**************************************/
#ifdef MSC
#include <time.h>
#include <ctype.h>
/* clock() ticks per second, as given by CLK_TCK. */
#define HZ CLK_TCK
/* Last value returned by clock(). */
clock_t tnow;

/* Microsoft C timer: clock() converted to seconds via HZ. */
double dtime()
{
    tnow = clock();
    return (double)tnow / (double)HZ;
}
#endif
/*************************************/
/* Macintosh (MAC) Think C dtime() */
/*************************************/
#ifdef MAC
#include <time.h>
/* Divisor applied to clock() in this variant. */
#define HZ 60

/* Macintosh (Think C) timer: clock() divided by HZ. */
double dtime()
{
    return (double)clock() / (double)HZ;
}
#endif
/************************************************************/
/* iPSC/860 (IPSC) dtime() for i860. */
/* Provided by: Dan Yergeau, yergeau@gloworm.Stanford.EDU */
/************************************************************/
#ifdef IPSC
extern double dclock();

/* iPSC/860 timer: returns the value of dclock() unchanged. */
double dtime()
{
    return dclock();
}
#endif
/**************************************************/
/* FORTRAN dtime() for Cray type systems. */
/* This is the preferred timer for Cray systems. */
/**************************************************/
#ifdef FORTRAN_SEC
fortran double second();

/* Cray timer: returns the value that second() stores through its
   argument. */
double dtime()
{
    double seconds;

    second(&seconds);
    return seconds;
}
#endif
/***********************************************************/
/* UNICOS C dtime() for Cray UNICOS systems. Don't use */
/* unless absolutely necessary as returned time includes */
/* 'user+system' time. Provided by: R. Mike Dority, */
/* dority@craysea.cray.com */
/***********************************************************/
#ifdef CTimer
#include <time.h>
/* UNICOS C timer: clock() converted to seconds via CLOCKS_PER_SEC. */
double dtime()
{
    clock_t ticks = clock();

    return (double)ticks / (double)CLOCKS_PER_SEC;
}
#endif
/********************************************/
/* Another UNIX timer using gettimeofday(). */
/* However, getrusage() is preferred. */
/********************************************/
#ifdef GTODay
#include <sys/time.h>
/* Buffer refilled by gettimeofday() on every dtime() call. */
struct timeval tnow;

/* gettimeofday() timer: seconds plus microseconds, as one double. */
double dtime()
{
    gettimeofday(&tnow,NULL);
    return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
}
#endif
/*****************************************************/
/* Fujitsu UXP/M timer. */
/* Provided by: Mathew Lim, ANUSF, M.Lim@anu.edu.au */
/*****************************************************/
#ifdef UXPM
#include <sys/types.h>
#include <sys/timesu.h>
/* Buffer refilled by timesu() on every dtime() call. */
struct tmsu rusage;

/* Fujitsu UXP/M timer: tms_utime from timesu(), scaled by 1.0e-06. */
double dtime()
{
    timesu(&rusage);
    return (double)(rusage.tms_utime) * 1.0e-06;
}
#endif
/* EVERYBODY: Please read "APOLOGY" below. -rick 01/06/86
*
* "DHRYSTONE" Benchmark Program
*
* Version: C/1.1, 12/01/84
*
* Date: PROGRAM updated 01/06/86, RESULTS updated 02/17/86
*
* Author: Reinhold P. Weicker, CACM Vol 27, No 10, 10/84 pg. 1013
* Translated from ADA by Rick Richardson
* Every method to preserve ADA-likeness has been used,
* at the expense of C-ness.
*
* Compile: cc -O dry.c -o drynr : No registers
* cc -O -DREG=register dry.c -o dryr : Registers
*
* Defines: Defines are provided for old C compilers
* which don't have enums, and can't assign structures.
* The time(2) function is library dependent; most
* return the time in seconds, but beware of some, like
* Aztec C, which return other units.
* The LOOPS define is initially set for 50000 loops.
* If you have a machine with large integers and is
* very fast, please change this number to 500000 to
* get better accuracy. Please select the way to
* measure the execution time using the TIME define.
* For single user machines, time(2) is adequate. For
* multi-user machines where you cannot get single-user
* access, use the times(2) function. If you have
* neither, use a stopwatch in the dead of night.
* Use a "printf" at the point marked "start timer"
* to begin your timings. DO NOT use the UNIX "time(1)"
* command, as this will measure the total time to
* run this program, which will (erroneously) include
* the time to malloc(3) storage and to compute the
* time it takes to do nothing.
*
* Run: drynr; dryr
*
* Results: If you get any new machine/OS results, please send to:
*
* {ihnp4,vax135,..}!houxm!castor!pcrat!rick
*
* and thanks to all that do. Space prevents listing
* the names of those who have provided some of these
* results. I'll be forwarding these results to
* Reinhold Weicker.
*
* Note: I order the list in increasing performance of the
* "with registers" benchmark. If the compiler doesn't
* provide register variables, then the benchmark
* is the same for both REG and NOREG.
*
* PLEASE: Send complete information about the machine type,
* clock speed, OS and C manufacturer/version. If
* the machine is modified, tell me what was done.
* On UNIX, execute uname -a and cc -V to get this info.
*
* 80x8x NOTE: 80x8x benchers: please try to do all memory models
* for a particular compiler.
*
* APOLOGY (1/30/86):
* Well, I goofed things up! As pointed out by Haakon Bugge,
* the line of code marked "GOOF" below was missing from the
* Dhrystone distribution for the last several months. It
* *WAS* in a backup copy I made last winter, so no doubt it
* was victimized by sleepy fingers operating vi!
*
* The effect of the line missing is that the reported benchmarks
* are 15% too fast (at least on a 80286). Now, this creates
* a dilemma - do I throw out ALL the data so far collected
* and use only results from this (corrected) version, or
* do I just keep collecting data for the old version?
*
* Since the data collected so far *is* valid as long as it
* is compared with like data, I have decided to keep
* TWO lists- one for the old benchmark, and one for the
* new. This also gives me an opportunity to correct one
* other error I made in the instructions for this benchmark.
* My experience with C compilers has been mostly with
* UNIX 'pcc' derived compilers, where the 'optimizer' simply
* fixes sloppy code generation (peephole optimization).
* But today, there exist C compiler optimizers that will actually
* perform optimization in the Computer Science sense of the word,
* by removing, for example, assignments to a variable whose
* value is never used. Dhrystone, unfortunately, provides
* lots of opportunities for this sort of optimization.
*
* I request that benchmarkers re-run this new, corrected
* version of Dhrystone, turning off or bypassing optimizers
* which perform more than peephole optimization. Please
* indicate the version of Dhrystone used when reporting the
* results to me.
*
* RESULTS BEGIN HERE
*
*----------------DHRYSTONE VERSION 1.1 RESULTS BEGIN--------------------------
*
* MACHINE MICROPROCESSOR OPERATING COMPILER DHRYSTONES/SEC.
* TYPE SYSTEM NO REG REGS
* -------------------------- ------------ ----------- ---------------
* IBM PC/AT 80286-7.5Mhz Venix/286 SVR2 cc 1159 1254 *15
*
*
*----------------DHRYSTONE VERSION 1.0 RESULTS BEGIN--------------------------
*
* MACHINE MICROPROCESSOR OPERATING COMPILER DHRYSTONES/SEC.
* TYPE SYSTEM NO REG REGS
* -------------------------- ------------ ----------- ---------------
* Commodore 64 6510-1MHz C64 ROM C Power 2.8 36 36
* HP-110 8086-5.33Mhz MSDOS 2.11 Lattice 2.14 284 284
* IBM PC/XT 8088-4.77Mhz PC/IX cc 271 294
* CCC 3205 ? Xelos(SVR2) cc 279 296
* Perq-II 2901 bitslice Accent S5c cc (CMU) 301 301
* IBM PC/XT 8088-4.77Mhz COHERENT 2.3.43 MarkWilliams cc 296 317
* Cosmos 68000-8Mhz UniSoft cc 305 322
* IBM PC/XT 8088-4.77Mhz Venix/86 2.0 cc 297 324
* DEC PRO 350 11/23 Venix/PRO SVR2 cc 299 325
* IBM PC 8088-4.77Mhz MSDOS 2.0 b16cc 2.0 310 340
* PDP11/23 11/23 Venix (V7) cc 320 358
* Commodore Amiga ? Lattice 3.02 368 371
* PC/XT 8088-4.77Mhz Venix/86 SYS V cc 339 377
* IBM PC 8088-4.77Mhz MSDOS 2.0 CI-C86 2.20M 390 390
* IBM PC/XT 8088-4.77Mhz PCDOS 2.1 Wizard 2.1 367 403
* IBM PC/XT 8088-4.77Mhz PCDOS 3.1 Lattice 2.15 403 403 @
* Colex DM-6 68010-8Mhz Unisoft SYSV cc 378 410
* IBM PC 8088-4.77Mhz PCDOS 3.1 Datalight 1.10 416 416
* IBM PC NEC V20-4.77Mhz MSDOS 3.1 MS 3.1 387 420
* IBM PC/XT 8088-4.77Mhz PCDOS 2.1 Microsoft 3.0 390 427
* IBM PC NEC V20-4.77Mhz MSDOS 3.1 MS 3.1 (186) 393 427
* PDP-11/34 - UNIX V7M cc 387 438
* IBM PC 8088, 4.77mhz PC-DOS 2.1 Aztec C v3.2d 423 454
* Tandy 1000 V20, 4.77mhz MS-DOS 2.11 Aztec C v3.2d 423 458
* Tandy TRS-16B 68000-6Mhz Xenix 1.3.5 cc 438 458
* PDP-11/34 - RSTS/E decus c 438 495
* Onyx C8002 Z8000-4Mhz IS/1 1.1 (V7) cc 476 511
* CCC 3230 Xelos (SysV.2) cc 507 565
* Tandy TRS-16B 68000-6Mhz Xenix 1.3.5 Green Hills 609 617
* DEC PRO 380 11/73 Venix/PRO SVR2 cc 577 628
* FHL QT+ 68000-10Mhz Os9/68000 version 1.3 603 649 FH
* Apollo DN550 68010-?Mhz AegisSR9/IX cc 3.12 666 666
* HP-110 8086-5.33Mhz MSDOS 2.11 Aztec-C 641 676
* ATT PC6300 8086-8Mhz MSDOS 2.11 b16cc 2.0 632 684
* IBM PC/AT 80286-6Mhz PCDOS 3.0 CI-C86 2.1 666 684
* Tandy 6000 68000-8Mhz Xenix 3.0 cc 694 694
* IBM PC/AT 80286-6Mhz Xenix 3.0 cc 684 704 MM
* Macintosh 68000-7.8Mhz 2M Mac Rom Mac C 32 bit int 694 704
* Macintosh 68000-7.7Mhz - MegaMax C 2.0 661 709
* IBM PC/AT 80286-6Mhz Xenix 3.0 cc 704 714 LM
* Codata 3300 68000-8Mhz UniPlus+ (v7) cc 678 725
* WICAT MB 68000-8Mhz System V WICAT C 4.1 585 731 ~
* Cadmus 9000 68010-10Mhz UNIX cc 714 735
* AT&T 6300 8086-8Mhz Venix/86 SVR2 cc 668 743
* Cadmus 9790 68010-10Mhz 1MB SVR0,Cadmus3.7 cc 720 747
* NEC PC9801F 8086-8Mhz PCDOS 2.11 Lattice 2.15 768 - @
* ATT PC6300 8086-8Mhz MSDOS 2.11 CI-C86 2.20M 769 769
* Burroughs XE550 68010-10Mhz Centix 2.10 cc 769 769 CT1
* EAGLE/TURBO 8086-8Mhz Venix/86 SVR2 cc 696 779
* ALTOS 586 8086-10Mhz Xenix 3.0b cc 724 793
* DEC 11/73 J-11 micro Ultrix-11 V3.0 System V 735 793
* ATT 3B2/300 WE32000-?Mhz UNIX 5.0.2 cc 735 806
* Apollo DN320 68010-?Mhz AegisSR9/IX cc 3.12 806 806
* IRIS-2400 68010-10Mhz UNIX System V cc 772 829
* Atari 520ST 68000-8Mhz TOS DigResearch 839 846
* IBM PC/AT 80286-6Mhz PCDOS 3.0 MS 3.0(large) 833 847 LM
* WICAT MB 68000-8Mhz System V WICAT C 4.1 675 853 S~
* VAX 11/750 - Ultrix 1.1 4.2BSD cc 781 862
* CCC 7350A 68000-8MHz UniSoft V.2 cc 821 875
* VAX 11/750 - UNIX 4.2bsd cc 862 877
* Fast Mac 68000-7.7Mhz - MegaMax C 2.0 839 904 +
* IBM PC/XT 8086-9.54Mhz PCDOS 3.1 Microsoft 3.0 833 909 C1
* DEC 11/44 Ultrix-11 V3.0 System V 862 909
* Macintosh 68000-7.8Mhz 2M Mac Rom Mac C 16 bit int 877 909 S
* CCC 3210 ? Xelos R01(SVR2) cc 849 924
* CCC 3220 ? Ed. 7 v2.3 cc 892 925
* IBM PC/AT 80286-6Mhz Xenix 3.0 cc -i 909 925
* AT&T 6300 8086, 8mhz MS-DOS 2.11 Aztec C v3.2d 862 943
* IBM PC/AT 80286-6Mhz Xenix 3.0 cc 892 961
* VAX 11/750 w/FPA Eunice 3.2 cc 914 976
* IBM PC/XT 8086-9.54Mhz PCDOS 3.1 Wizard 2.1 892 980 C1
* IBM PC/XT 8086-9.54Mhz PCDOS 3.1 Lattice 2.15 980 980 C1
* Plexus P35 68000-10Mhz UNIX System III cc 984 980
* PDP-11/73 KDJ11-AA 15Mhz UNIX V7M 2.1 cc 862 981
* VAX 11/750 w/FPA UNIX 4.3bsd cc 994 997
* IRIS-1400 68010-10Mhz UNIX System V cc 909 1000
* IBM PC/AT 80286-6Mhz Venix/86 2.1 cc 961 1000
* IBM PC/AT 80286-6Mhz PCDOS 3.0 b16cc 2.0 943 1063
* Zilog S8000/11 Z8001-5.5Mhz Zeus 3.2 cc 1011 1084
* NSC ICM-3216 NSC 32016-10Mhz UNIX SVR2 cc 1041 1084
* IBM PC/AT 80286-6Mhz PCDOS 3.0 MS 3.0(small) 1063 1086
* VAX 11/750 w/FPA VMS VAX-11 C 2.0 958 1091
* Stride 68000-10Mhz System-V/68 cc 1041 1111
* Plexus P/60 MC68000-12.5Mhz UNIX SYSIII Plexus 1111 1111
* ATT PC7300 68010-10Mhz UNIX 5.2 cc 1041 1111
* CCC 3230 ? Xelos R01(SVR2) cc 1040 1126
* Stride 68000-12Mhz System-V/68 cc 1063 1136
* IBM PC/AT 80286-6Mhz Venix/286 SVR2 cc 1056 1149
* Plexus P/60 MC68000-12.5Mhz UNIX SYSIII Plexus 1111 1163 T
* IBM PC/AT 80286-6Mhz PCDOS 3.0 Datalight 1.10 1190 1190
* ATT PC6300+ 80286-6Mhz MSDOS 3.1 b16cc 2.0 1111 1219
* IBM PC/AT 80286-6Mhz PCDOS 3.1 Wizard 2.1 1136 1219
* Sun2/120 68010-10Mhz Sun 4.2BSD cc 1136 1219
* IBM PC/AT 80286-6Mhz PCDOS 3.0 CI-C86 2.20M 1219 1219
* WICAT PB 68000-8Mhz System V WICAT C 4.1 998 1226 ~
* MASSCOMP 500 68010-10MHz RTU V3.0 cc (V3.2) 1156 1238
* Alliant FX/8 IP (68012-12Mhz) Concentrix cc -ip;exec -i 1170 1243 FX
* Cyb DataMate 68010-12.5Mhz Uniplus 5.0 Unisoft cc 1162 1250
* PDP 11/70 - UNIX 5.2 cc 1162 1250
* IBM PC/AT 80286-6Mhz PCDOS 3.1 Lattice 2.15 1250 1250
* IBM PC/AT 80286-7.5Mhz Venix/86 2.1 cc 1190 1315 *15
* Sun2/120 68010-10Mhz Standalone cc 1219 1315
* Intel 380 80286-8Mhz Xenix R3.0up1 cc 1250 1315 *16
* Sequent Balance 8000 NS32032-10MHz Dynix 2.0 cc 1250 1315 N12
* IBM PC/DSI-32 32032-10Mhz MSDOS 3.1 GreenHills 2.14 1282 1315 C3
* ATT 3B2/400 WE32100-?Mhz UNIX 5.2 cc 1315 1315
* CCC 3250XP - Xelos R01(SVR2) cc 1215 1318
* IBM PC/RT 032 RISC(801?)?Mhz BSD 4.2 cc 1248 1333 RT
* DG MV4000 - AOS/VS 5.00 cc 1333 1333
* IBM PC/AT 80286-8Mhz Venix/86 2.1 cc 1275 1380 *16
* IBM PC/AT 80286-6Mhz MSDOS 3.0 Microsoft 3.0 1250 1388
* ATT PC6300+ 80286-6Mhz MSDOS 3.1 CI-C86 2.20M 1428 1428
* COMPAQ/286 80286-8Mhz Venix/286 SVR2 cc 1326 1443
* IBM PC/AT 80286-7.5Mhz Venix/286 SVR2 cc 1333 1449 *15
* WICAT PB 68000-8Mhz System V WICAT C 4.1 1169 1464 S~
* Tandy II/6000 68000-8Mhz Xenix 3.0 cc 1384 1477
* WICAT MB 68000-12.5Mhz System V WICAT C 4.1 1246 1537 ~
* IBM PC/AT 80286-9Mhz SCO Xenix V cc 1540 1556 *18
* Cyb DataMate 68010-12.5Mhz Uniplus 5.0 Unisoft cc 1470 1562 S
* VAX 11/780 - UNIX 5.2 cc 1515 1562
* MicroVAX-II - - - 1562 1612
* VAX 11/780 - UNIX 4.3bsd cc 1646 1662
* Apollo DN660 - AegisSR9/IX cc 3.12 1666 1666
* ATT 3B20 - UNIX 5.2 cc 1515 1724
* NEC PC-98XA 80286-8Mhz PCDOS 3.1 Lattice 2.15 1724 1724 @
* HP9000-500 B series CPU HP-UX 4.02 cc 1724 -
* IBM PC/STD 80286-8Mhz MSDOS 3.0 Microsoft 3.0 1724 1785 C2
* WICAT MB 68000-12.5Mhz System V WICAT C 4.1 1450 1814 S~
* WICAT PB 68000-12.5Mhz System V WICAT C 4.1 1530 1898 ~
* DEC-2065 KL10-Model B TOPS-20 6.1FT5 Port. C Comp. 1937 1946
* Gould PN6005 - UTX 1.1(4.2BSD) cc 1675 1964
* DEC2060 KL-10 TOPS-20 cc 2000 2000 &
* VAX 11/785 - UNIX 5.2 cc 2083 2083
* VAX 11/785 - VMS VAX-11 C 2.0 2083 2083
* VAX 11/785 - UNIX SVR2 cc 2123 2083
* VAX 11/785 - ULTRIX-32 1.1 cc 2083 2091
* VAX 11/785 - UNIX 4.3bsd cc 2135 2136
* WICAT PB 68000-12.5Mhz System V WICAT C 4.1 1780 2233 S~
* Pyramid 90x - OSx 2.3 cc 2272 2272
* Pyramid 90x FPA,cache,4Mb OSx 2.5 cc no -O 2777 2777
* Pyramid 90x w/cache OSx 2.5 cc w/-O 3333 3333
* IBM-4341-II - VM/SP3 Waterloo C 1.2 3333 3333
* IRIS-2400T 68020-16.67Mhz UNIX System V cc 3105 3401
* Celerity C-1200 ? UNIX 4.2BSD cc 3485 3468
* SUN 3/75 68020-16.67Mhz SUN 4.2 V3 cc 3333 3571
* IBM-4341 Model 12 UTS 5.0 ? 3685 3685
* SUN-3/160 68020-16.67Mhz Sun 4.2 V3.0A cc 3381 3764
* Sun 3/180 68020-16.67Mhz Sun 4.2 cc 3333 3846
* IBM-4341 Model 12 UTS 5.0 ? 3910 3910 MN
* MC 5400 68020-16.67MHz RTU V3.0 cc (V4.0) 3952 4054
* NCR Tower32 68020-16.67Mhz SYS 5.0 Rel 2.0 cc 3846 4545
* Gould PN9080 - UTX-32 1.1c cc - 4629
* MC 5600/5700 68020-16.67MHz RTU V3.0 cc (V4.0) 4504 4746 %
* Gould 1460-342 ECL proc UTX/32 1.1/c cc 5342 5677 G1
* VAX 8600 - UNIX 4.3bsd cc 7024 7088
* VAX 8600 - VMS VAX-11 C 2.0 7142 7142
* Alliant FX/8 CE Concentrix cc -ce;exec -c 6952 7655 FX
* CCI POWER 6/32 COS(SV+4.2) cc 7500 7800
* CCI POWER 6/32 POWER 6 UNIX/V cc 8236 8498
* CCI POWER 6/32 4.2 Rel. 1.2b cc 8963 9544
* Sperry (CCI Power 6) 4.2BSD cc 9345 10000
* CRAY-X-MP/12 105Mhz COS 1.14 Cray C 10204 10204
* IBM-3083 - UTS 5.0 Rel 1 cc 16666 12500
* CRAY-1A 80Mhz CTSS Cray C 2.0 12100 13888
* IBM-3083 - VM/CMS HPO 3.4 Waterloo C 1.2 13889 13889
* Amdahl 470 V/8 UTS/V 5.2 cc v1.23 15560 15560
* CRAY-X-MP/48 105Mhz CTSS Cray C 2.0 15625 17857
* Amdahl 580 - UTS 5.0 Rel 1.2 cc v1.5 23076 23076
* Amdahl 5860 UTS/V 5.2 cc v1.23 28970 28970
*
* NOTE
* * Crystal changed from 'stock' to listed value.
* + This Macintosh was upgraded from 128K to 512K in such a way that
* the new 384K of memory is not slowed down by video generator accesses.
* % Single processor; MC == MASSCOMP
* & A version 7 C compiler written at New Mexico Tech.
* @ vanilla Lattice compiler used with MicroPro standard library
* S Shorts used instead of ints
* T with Chris Torek's patches (whatever they are).
* ~ For WICAT Systems: MB=MultiBus, PB=Proprietary Bus
* LM Large Memory Model. (Otherwise, all 80x8x results are small model)
* MM Medium Memory Model. (Otherwise, all 80x8x results are small model)
* C1 Univation PC TURBO Co-processor; 9.54Mhz 8086, 640K RAM
* C2 Seattle Telecom STD-286 board
* C3 Definicon DSI-32 coprocessor
* C? Unknown co-processor board?
* CT1 Convergent Technologies MegaFrame, 1 processor.
* MN Using Mike Newtons 'optimizer' (see net.sources).
* G1 This Gould machine has 2 processors and was able to run 2 dhrystone
* Benchmarks in parallel with no slowdown.
* FH FHC == Frank Hogg Labs (Hazelwood Uniquad 2 in an FHL box).
* FX The Alliant FX/8 is a system consisting of 1-8 CEs (computation
* engines) and 1-12 IPs (interactive processors). Note N8 applies.
* RT This is one of the RT's that CMU has been using for awhile. I'm
* not sure that this is identical to the machine that IBM is selling
* to the public.
* Nnn This machine has multiple processors, allowing "nn" copies of the
* benchmark to run in the same time as 1 copy.
* ? I don't trust results marked with '?'. These were sent to me with
* either incomplete info, or with times that just don't make sense.
* ?? means I think the performance is too poor, ?! means too good.
* If anybody can confirm these figures, please respond.
*
* ABBREVIATIONS
* CCC Concurrent Computer Corp. (was Perkin-Elmer)
* MC Masscomp
*
*--------------------------------RESULTS END----------------------------------
*
* The following program contains statements of a high-level programming
* language (C) in a distribution considered representative:
*
* assignments 53%
* control statements 32%
* procedure, function calls 15%
*
* 100 statements are dynamically executed. The program is balanced with
* respect to the three aspects:
* - statement type
* - operand type (for simple data types)
* - operand access
* operand global, local, parameter, or constant.
*
* The combination of these three aspects is balanced only approximately.
*
* The program does not compute anything meaningful, but it is
* syntactically and semantically correct.
*
*/
/* Accuracy of timings and human fatigue controlled by next two lines */
/*#define LOOPS 50000 /* Use this for slow or 16 bit machines */
#define LOOPS 5000 /* Use this for faster machines */
/* Compiler dependent options */
#undef NOENUM /* Define if compiler has no enum's */
#undef NOSTRUCTASSIGN /* Define if compiler can't assign structures */
/* define only one of the next two defines */
#define TIMES /* Use times(2) time function */
/*#define TIME /* Use time(2) time function */
/* define the granularity of your times(2) function (when used) */
/*#define HZ 60 /* times(2) returns 1/60 second (most) */
#define HZ 100 /* times(2) returns 1/100 second (WECo) */
/* for compatibility with goofed up version */
/*#define GOOF /* Define if you want the goofed up version */
#ifdef GOOF
char Version[] = "1.0";
#else
char Version[] = "1.1";
#endif
#ifdef NOSTRUCTASSIGN
#define structassign(d, s) memcpy(&(d), &(s), sizeof(d))
#else
#define structassign(d, s) d = s
#endif
#ifdef NOENUM
#define Ident1 1
#define Ident2 2
#define Ident3 3
#define Ident4 4
#define Ident5 5
typedef int Enumeration;
#else
typedef enum {Ident1, Ident2, Ident3, Ident4, Ident5} Enumeration;
#endif
typedef int OneToThirty;
typedef int OneToFifty;
typedef char CapitalLetter;
typedef char String30[31];
typedef int Array1Dim[51];
typedef int Array2Dim[51][51];
struct Record
{
struct Record *PtrComp;
Enumeration Discr;
Enumeration EnumComp;
OneToFifty IntComp;
String30 StringComp;
};
typedef struct Record RecordType;
typedef RecordType * RecordPtr;
typedef int boolean;
#define NULL 0
#define TRUE 1
#define FALSE 0
#ifndef REG
#define REG
#endif
extern Enumeration Func1();
extern boolean Func2();
#ifdef TIMES
#include <sys/types.h>
#include <sys/times.h>
#endif
/* Proc0 is defined further down; declare it so the call below is not implicit. */
int Proc0();
/*
 * Program entry point: runs the complete benchmark (setup, timed loop and
 * report) through Proc0() and returns 0 to the environment.
 *
 * The return type is spelled out (implicit int is not valid since C99) and
 * the function returns normally rather than calling exit(), which this
 * file never declares.
 */
int main()
{
    Proc0();
    return 0;
}
/*
* Package 1
*/
/* Global state shared by the benchmark procedures. */
int IntGlob; /* set to 5 by Proc8 and to 100 by Proc3's else branch; read by Proc2, Proc3, Proc6 */
boolean BoolGlob; /* cleared by Proc5, set from Func2's result in Proc0; read by Proc4 */
char Char1Glob; /* set to 'A' by Proc5; tested by Proc2 and Proc4 */
char Char2Glob; /* set to 'B' by Proc4; upper bound of the character loop in Proc0 */
Array1Dim Array1Glob; /* passed to Proc8 by Proc0 */
Array2Dim Array2Glob; /* passed to Proc8 by Proc0; element [8][7] is preset in Proc0 */
RecordPtr PtrGlb; /* record allocated and initialised in Proc0 */
RecordPtr PtrGlbNext; /* second record allocated in Proc0; PtrGlb->PtrComp points at it */
/*
 * Proc0 -- benchmark driver.
 *
 * 1. Times an empty loop of LOOPS iterations to estimate loop overhead
 *    (nulltime).
 * 2. Allocates and initialises the two global records and the strings.
 * 3. Runs the measured loop LOOPS times, calling Proc1..Proc8 and
 *    Func1/Func2.
 * 4. Prints the elapsed time and the dhrystones/second figure.
 *
 * Timing uses time(2) when TIME is defined and the user time reported by
 * times(2) (in 1/HZ second ticks) when TIMES is defined.
 *
 * Changes from the published code, all outside the measured loop:
 *  - tick counts from times(2) are held in clock_t (the type of tms_utime)
 *    instead of time_t;
 *  - the malloc() results are checked before use;
 *  - the rate is only computed when the corrected time is positive; on a
 *    fast machine the run can take less than one tick, which used to end
 *    in an integer division by zero;
 *  - values printed with %ld are converted to long explicitly.
 */
Proc0()
{
    OneToFifty IntLoc1;
    REG OneToFifty IntLoc2;
    OneToFifty IntLoc3;
    REG char CharLoc;
    REG char CharIndex;
    Enumeration EnumLoc;
    String30 String1Loc;
    String30 String2Loc;
    extern char *malloc();
#ifdef TIME
    long time();
    long starttime;
    long benchtime;
    long nulltime;
    register unsigned int i;
    starttime = time( (long *) 0);
    for (i = 0; i < LOOPS; ++i);
    nulltime = time( (long *) 0) - starttime; /* Computes o'head of loop */
#endif
#ifdef TIMES
    clock_t starttime;
    clock_t benchtime;
    clock_t nulltime;
    struct tms tms;
    register unsigned int i;
    times(&tms); starttime = tms.tms_utime;
    for (i = 0; i < LOOPS; ++i);
    times(&tms);
    nulltime = tms.tms_utime - starttime; /* Computes overhead of looping */
#endif
    PtrGlbNext = (RecordPtr) malloc(sizeof(RecordType));
    PtrGlb = (RecordPtr) malloc(sizeof(RecordType));
    if (PtrGlb == NULL || PtrGlbNext == NULL)
    {
        /* Nothing can be measured without the two records. */
        printf("Dhrystone: out of memory\n");
        return 0;
    }
    PtrGlb->PtrComp = PtrGlbNext;
    PtrGlb->Discr = Ident1;
    PtrGlb->EnumComp = Ident3;
    PtrGlb->IntComp = 40;
    strcpy(PtrGlb->StringComp, "DHRYSTONE PROGRAM, SOME STRING");
#ifndef GOOF
    strcpy(String1Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); /*GOOF*/
#endif
    Array2Glob[8][7] = 10; /* Was missing in published program */
/*****************
-- Start Timer --
*****************/
#ifdef TIME
    starttime = time( (long *) 0);
#endif
#ifdef TIMES
    times(&tms); starttime = tms.tms_utime;
#endif
    for (i = 0; i < LOOPS; ++i)
    {
        Proc5();
        Proc4();
        IntLoc1 = 2;
        IntLoc2 = 3;
        strcpy(String2Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
        EnumLoc = Ident2;
        BoolGlob = ! Func2(String1Loc, String2Loc);
        while (IntLoc1 < IntLoc2)
        {
            IntLoc3 = 5 * IntLoc1 - IntLoc2;
            Proc7(IntLoc1, IntLoc2, &IntLoc3);
            ++IntLoc1;
        }
        Proc8(Array1Glob, Array2Glob, IntLoc1, IntLoc3);
        Proc1(PtrGlb);
        for (CharIndex = 'A'; CharIndex <= Char2Glob; ++CharIndex)
            if (EnumLoc == Func1(CharIndex, 'C'))
                Proc6(Ident1, &EnumLoc);
        IntLoc3 = IntLoc2 * IntLoc1;
        IntLoc2 = IntLoc3 / IntLoc1;
        IntLoc2 = 7 * (IntLoc3 - IntLoc2) - IntLoc1;
        Proc2(&IntLoc1);
    }
/*****************
-- Stop Timer --
*****************/
#ifdef TIME
    benchtime = time( (long *) 0) - starttime - nulltime;
    printf("Dhrystone(%s) time for %ld passes = %ld\n",
        Version,
        (long) LOOPS, benchtime);
    if (benchtime > 0)
        printf("This machine benchmarks at %ld dhrystones/second\n",
            ((long) LOOPS) / benchtime);
    else
        /* Less than one second elapsed: the rate cannot be computed. */
        printf("Measured time too small to compute dhrystones/second; increase LOOPS\n");
#endif
#ifdef TIMES
    times(&tms);
    benchtime = tms.tms_utime - starttime - nulltime;
    printf("Dhrystone(%s) time for %ld passes = %ld\n",
        Version,
        (long) LOOPS, (long) (benchtime / HZ));
    if (benchtime > 0)
        printf("This machine benchmarks at %ld dhrystones/second\n",
            (long) (((long) LOOPS) * HZ / benchtime));
    else
        /* Less than one clock tick elapsed: the rate cannot be computed. */
        printf("Measured time too small to compute dhrystones/second; increase LOOPS\n");
#endif
}
/*
 * Proc1 -- record manipulation.
 *
 * PtrParIn points at a record whose PtrComp links to a second record,
 * referred to below as NextRecord.  The procedure copies *PtrGlb into
 * NextRecord, updates the integer components, lets Proc3 set
 * NextRecord.PtrComp, and then either adjusts NextRecord further (when its
 * Discr is Ident1) or copies NextRecord back into *PtrParIn.
 *
 * Proc3 takes a RecordPtr * and stores through it, so it is given the
 * address of NextRecord.PtrComp.  The published code passed the pointer
 * value itself; that value points at NextRecord, whose first member is
 * PtrComp, so the store landed on the same location, but the argument type
 * did not match the parameter type.
 */
Proc1(PtrParIn)
REG RecordPtr PtrParIn;
{
#define NextRecord (*(PtrParIn->PtrComp))
    structassign(NextRecord, *PtrGlb);
    PtrParIn->IntComp = 5;
    NextRecord.IntComp = PtrParIn->IntComp;
    NextRecord.PtrComp = PtrParIn->PtrComp;
    Proc3(&NextRecord.PtrComp);
    if (NextRecord.Discr == Ident1)
    {
        NextRecord.IntComp = 6;
        Proc6(PtrParIn->EnumComp, &NextRecord.EnumComp);
        NextRecord.PtrComp = PtrGlb->PtrComp;
        Proc7(NextRecord.IntComp, 10, &NextRecord.IntComp);
    }
    else
        structassign(*PtrParIn, NextRecord);
#undef NextRecord
}
/*
 * Proc2 -- in/out integer parameter.
 *
 * When Char1Glob is 'A', stores (*IntParIO + 10 - 1) - IntGlob back into
 * *IntParIO and leaves the loop after one pass.
 *
 * NOTE(review): EnumLoc is only assigned inside the Char1Glob == 'A'
 * branch.  If Char1Glob held any other value, the loop test would read an
 * uninitialised variable and nothing in the loop could end it.  Proc0 calls
 * Proc5 (which sets Char1Glob to 'A') earlier in every pass.
 */
Proc2(IntParIO)
OneToFifty *IntParIO;
{
REG OneToFifty IntLoc;
REG Enumeration EnumLoc;
IntLoc = *IntParIO + 10;
for(;;)
{
if (Char1Glob == 'A')
{
--IntLoc;
*IntParIO = IntLoc - IntGlob;
EnumLoc = Ident1;
}
/* leave once the branch above has run */
if (EnumLoc == Ident1)
break;
}
}
/*
 * Proc3 -- pointer output parameter.
 *
 * Stores PtrGlb->PtrComp through PtrParOut when PtrGlb is non-null,
 * otherwise sets IntGlob to 100.  In both cases it then calls Proc7, which
 * writes 10 + 2 + IntGlob into PtrGlb->IntComp.
 *
 * NOTE(review): the Proc7 call is outside the if/else and dereferences
 * PtrGlb even on the path where it was found to be NULL.
 */
Proc3(PtrParOut)
RecordPtr *PtrParOut;
{
if (PtrGlb != NULL)
*PtrParOut = PtrGlb->PtrComp;
else
IntGlob = 100;
Proc7(10, IntGlob, &PtrGlb->IntComp);
}
/*
 * Proc4 -- boolean expression on globals.
 *
 * Computes (Char1Glob == 'A') | BoolGlob into a local that is not used
 * afterwards, then sets Char2Glob to 'B'.  The only lasting effect is the
 * assignment to Char2Glob.
 */
Proc4()
{
REG boolean BoolLoc;
BoolLoc = Char1Glob == 'A';
BoolLoc |= BoolGlob;
Char2Glob = 'B';
}
/*
 * Proc5 -- resets two globals: Char1Glob becomes 'A' and BoolGlob becomes
 * FALSE.
 */
Proc5()
{
Char1Glob = 'A';
BoolGlob = FALSE;
}
/* Func3 is defined after its first use; declare its non-int return type. */
extern boolean Func3();
/*
 * Proc6 -- maps an enumeration value to another one.
 *
 * *EnumParOut first receives EnumParIn, or Ident4 when Func3(EnumParIn) is
 * false.  The switch then overrides that value for every input except
 * Ident4:
 *   Ident1 -> Ident1
 *   Ident2 -> Ident1 if IntGlob > 100, else Ident4
 *   Ident3 -> Ident2
 *   Ident4 -> unchanged
 *   Ident5 -> Ident3
 */
Proc6(EnumParIn, EnumParOut)
REG Enumeration EnumParIn;
REG Enumeration *EnumParOut;
{
*EnumParOut = EnumParIn;
if (! Func3(EnumParIn) )
*EnumParOut = Ident4;
switch (EnumParIn)
{
case Ident1: *EnumParOut = Ident1; break;
case Ident2: if (IntGlob > 100) *EnumParOut = Ident1;
else *EnumParOut = Ident4;
break;
case Ident3: *EnumParOut = Ident2; break;
case Ident4: break;
/* last case: no break needed */
case Ident5: *EnumParOut = Ident3;
}
}
/*
 * Proc7 -- integer arithmetic with an output parameter.
 *
 * Stores IntParI1 + 2 + IntParI2 into *IntParOut.
 */
Proc7(IntParI1, IntParI2, IntParOut)
OneToFifty IntParI1;
OneToFifty IntParI2;
OneToFifty *IntParOut;
{
REG OneToFifty IntLoc;
IntLoc = IntParI1 + 2;
*IntParOut = IntParI2 + IntLoc;
}
/*
 * Proc8 -- array element accesses.
 *
 * With IntLoc = IntParI1 + 5 it writes:
 *   Array1Par[IntLoc]               = IntParI2
 *   Array1Par[IntLoc+1]             = Array1Par[IntLoc]
 *   Array1Par[IntLoc+30]            = IntLoc
 *   Array2Par[IntLoc][IntLoc..+1]   = IntLoc
 *   Array2Par[IntLoc][IntLoc-1]     incremented by one
 *   Array2Par[IntLoc+20][IntLoc]    = Array1Par[IntLoc]
 * and finally sets the global IntGlob to 5.
 *
 * No bounds are checked; the arrays have 51 elements per dimension, so the
 * caller must keep IntParI1 + 35 within range.
 */
Proc8(Array1Par, Array2Par, IntParI1, IntParI2)
Array1Dim Array1Par;
Array2Dim Array2Par;
OneToFifty IntParI1;
OneToFifty IntParI2;
{
REG OneToFifty IntLoc;
REG OneToFifty IntIndex;
IntLoc = IntParI1 + 5;
Array1Par[IntLoc] = IntParI2;
Array1Par[IntLoc+1] = Array1Par[IntLoc];
Array1Par[IntLoc+30] = IntLoc;
for (IntIndex = IntLoc; IntIndex <= (IntLoc+1); ++IntIndex)
Array2Par[IntLoc][IntIndex] = IntLoc;
++Array2Par[IntLoc][IntLoc-1];
Array2Par[IntLoc+20][IntLoc] = Array1Par[IntLoc];
IntGlob = 5;
}
/*
 * Func1 -- character comparison.
 *
 * Returns Ident1 when the two characters differ and Ident2 when they are
 * equal.  The first character is copied through two locals before the
 * comparison.
 */
Enumeration Func1(CharPar1, CharPar2)
CapitalLetter CharPar1;
CapitalLetter CharPar2;
{
REG CapitalLetter CharLoc1;
REG CapitalLetter CharLoc2;
CharLoc1 = CharPar1;
CharLoc2 = CharLoc1;
if (CharLoc2 != CharPar2)
return (Ident1);
else
return (Ident2);
}
/*
 * Func2 -- string comparison.
 *
 * Loops until Func1 reports that StrParI1[1] and StrParI2[2] differ; the
 * loop body then sets CharLoc to 'A' and advances IntLoc past the loop
 * bound.  With CharLoc equal to 'A', neither the 'W'..'Z' test nor the 'X'
 * test can succeed, so the result is TRUE when strcmp(StrParI1, StrParI2)
 * is positive and FALSE otherwise.
 *
 * NOTE(review): if the two compared characters were equal, the while loop
 * would never change IntLoc and would not terminate.
 */
boolean Func2(StrParI1, StrParI2)
String30 StrParI1;
String30 StrParI2;
{
REG OneToThirty IntLoc;
REG CapitalLetter CharLoc;
IntLoc = 1;
while (IntLoc <= 1)
if (Func1(StrParI1[IntLoc], StrParI2[IntLoc+1]) == Ident1)
{
CharLoc = 'A';
++IntLoc;
}
if (CharLoc >= 'W' && CharLoc <= 'Z')
IntLoc = 7;
if (CharLoc == 'X')
return(TRUE);
else
{
if (strcmp(StrParI1, StrParI2) > 0)
{
/* IntLoc is not read again before the return */
IntLoc += 7;
return (TRUE);
}
else
return (FALSE);
}
}
/*
 * Func3 -- enumeration test.
 *
 * Returns TRUE when EnumParIn is Ident3 and FALSE for every other value.
 */
boolean Func3(EnumParIn)
REG Enumeration EnumParIn;
{
REG Enumeration EnumLoc;
EnumLoc = EnumParIn;
if (EnumLoc == Ident3) return (TRUE);
return (FALSE);
}
#ifdef NOSTRUCTASSIGN
/*
 * Byte-by-byte copy used by structassign() when the compiler cannot assign
 * structures.  Copies l bytes from s to d, front to back, and returns no
 * value.  Only compiled when NOSTRUCTASSIGN is defined.
 */
memcpy(d, s, l)
register char *d;
register char *s;
register int l;
{
while (l--) *d++ = *s++;
}
#endif
/*
 * Disabled fragment: the lines below repeat the tail of Func2, starting in
 * the middle of a statement.  They are excluded from compilation by #if 0.
 */
#if 0
IntLoc;
}
if (CharLoc >= 'W' && CharLoc <= 'Z')
IntLoc = 7;
if (CharLoc == 'X')
return(TRUE);
else
{
if (strcmp(StrParI1, StrParI2) > 0)
{
IntLoc += 7;
return (TRUE);
}
else
return (FALSE);
}
}
#endif
13.50e+0 9.00e+0 8.00e+0 Box dimensions
0.80e+0 0.99e+0 0.54e+0 0.84e+0 0.01e+0 0.84e+0
0.80e+0 0.01e+0 0.54e+0 0.84e+0 0.01e+0 0.84e+0
0.80e+0 0.01e+0 0.54e+0 0.84e+0 0.99e+0 0.84e+0 Reflectivities (RGB)
1.27e+0 0.00e+0 0.00e+0 0.00e+0 0.00e+0 0.00e+0
1.27e+0 0.00e+0 0.00e+0 0.00e+0 0.00e+0 0.00e+0
1.27e+0 0.00e+0 0.00e+0 0.00e+0 0.00e+0 0.00e+0 Emissivities (RGB)
#include <stdio.h>
#include <stdlib.h>
/*
 * Monte Carlo estimate of pi.
 *
 * Usage: pi <iterations>
 *
 * Draws <iterations> points (d1, d2), each coordinate being random()
 * divided by 2147483647.0, counts the points with d1*d1 + d2*d2 <= 1 and
 * prints 4 * hits / iterations with ten decimals.  The generator is seeded
 * with the constant 1, so a given iteration count always produces the same
 * output.
 *
 * Returns 0 on success and 1 when the argument is missing or is not a
 * positive decimal integer.  (The previous code exited with 0 on a usage
 * error, returned 1 on success and printed a NaN for 0 iterations.)
 */
int main(int argc, char *argv[])
{
    char *end;
    long its, i, hits = 0;
    double d1, d2;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <iterations>\n", argv[0]);
        return 1;
    }

    /* strtol reports where parsing stopped, so bad input can be rejected. */
    its = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || its <= 0) {
        fprintf(stderr, "%s: <iterations> must be a positive integer\n",
                argv[0]);
        return 1;
    }

    srandom(1);
    for (i = 0; i < its; i++) {
        d1 = ((double)random())/2147483647.0;
        d2 = ((double)random())/2147483647.0;
        if (((d1*d1) + (d2*d2)) <= 1)
            hits++;
    }

    printf("%.10f\n", 4.0 * ((double)hits / (double)its));
    return 0;
}
/******************************************************************************
S L A L O M
Scalable Language-independent Ames Laboratory One-minute Measurement
The following program is the first benchmark based on fixed time rather
than fixed problem comparison. Not only is fixed time more representative
of the way people use computers, it also greatly increases the scope and
longevity of the benchmark. SLALOM is very scalable, and can be used to
compare computers as slow as 126 floating-point operations per second
(FLOPS) to computers running a trillion times faster. The scalability can
be used to compare single processors to massively parallel collections
of processors, and to study the space of problem size vs. ensemble size
in fine detail. It resembles the LINPACK benchmark since it involves
factoring and backsolving a (nearly) dense matrix, but incorporates a
number of improvements to that benchmark that we hope will make SLALOM
a better reflection of general system performance.
The SLALOM benchmark solves a complete, real problem (optical radiosity
on the interior of a box), not a contrived kernel or a synthetic mixture of
sample operations. SLALOM is unusual since it times input, problem setup,
solution, and output, not just the solution. For slower computers, the
problem setup will take the majority of the time; it grows as the square of
the problem size. The solver grows as the cube of the problem size, and
dominates the time for large values of n.
While the following is C, you are free to translate it into any
language you like, including assembly language specific to one computer.
You may use compiler directives, hand-tuned library calls, loop unrolling,
and even change the algorithm, if you can provide a convincing argument
that the program still works for the full range of possible inputs. For
example, if you replace the direct solver with an iterative one, you must
make sure your method is correct even when the geometry is quite eccentric
and the box faces are highly reflective. (rho = .999)
The main() driver should be used with the value of 60 seconds for the
SLALOM benchmark. The work done for a particular problem size is figured
after timing has ceased, so there is no overhead for work assessment. The
residual check ||Ax - b|| is also done after timing has ceased. Two
computers may be compared either by their problem size n, or by their MFLOPS
rate, never by the ratio of execution times. Times will always be near one
minute in SLALOM. We have used the following weights for floating-point
operation counting, based on the weights used by Lawrence Livermore National
Laboratory:
OPERATION WEIGHT
a=b, a=(constant) 0
a<0, a<=0, a==0, a!=0, a>0, a>=0 0
-a, fabs(a), fsgn(a, b) 0
a+b, a-b, a*b, a^2 1
a<b, a<=b, a==b, a!=b, a>b, a>=b 1
(int) a, (double)b 1
1/a, -1/a 3
a/b 4
sqrt(a) 4
Format to or from ASCII string 6
sin(a), cos(a), tan(a), log(a), atan(a), exp(a) 8
We invite you to share with us the results of any measurements that you
make with SLALOM. We do NOT accept anonymous data; machine timings will be
referenced and dated.
The least you need to do to adapt SLALOM to your computer is:
1. In the "Measure" routine, set NMAX to a value large enough to keep
the computer working for a minute. Vary it slightly if it helps
(for reasons of cache size, interleaving, etc.)
2. Replace the timer call in "When" with the most accurate wall-clock
timer at your disposal. If only CPU time is available, try to run
the job standalone or at high priority, since we are ultimately
interested in the top of the statistical range of performance.
3. Edit in the information specific to your test in the "What"
routine, so that final output will be automatically annotated.
4. Compile, link, and run the program, interacting to select values
of n that bracket a time of one minute. Once everything is
running, run it as a batch job so as to record the session.
Examples of ways you may optimize performance:
1. Unroll the loops in SetUp1 and SetUp2; it is possible to
vectorize both SetUp1 and SetUp2 at the cost of some extra
operations, program complexity, and storage.
2. Replace the innermost loops of Solver with calls to well-tuned
libraries of linear algebra routines, such as DDOT from the
Basic Linear Algebra Subroutines (level 1 BLAS). Better still,
use a tuned library routine for all of Solver; the sparsity
exploited in Solver is only a few percent, so you will usually
gain more than you lose by applying a dense symmetric solver.
3. Parallelize the SetUp and Solver routines; all are highly
parallel. Each element of the matrix can be constructed
independently, once each processor knows the geometry and part of
the partitioning into regions. A substantial body of literature
now exists for performing the types of operations in Solver in
parallel.
4. Overlap computation with output. Once the Region routine is done,
the first part of the output file (patch geometry) can be written
while the radiosities are being calculated.
Examples of what you may NOT do:
1. The tuning must not be made specific to the particular input
provided. For example, you may not eliminate IF tests simply
because they always come out the same way for this input; you
may not use precomputed answers or table look-up unless those
answers and tables cover the full range of possible inputs; and
you may not exploit symmetry for even values of the problem size.
2. You may not disable the self-consistency tests in SetUp3 and
Verify, nor alter their tolerance constants.
3. You may not change the input or output files to unformatted
binary or other format that would render them difficult to create
or read for humans.
4. You may not eliminate the reading of the "geom" file by putting
its data directly into the compiled program.
5. You may not change any of the work assessments in Meter. If you
use more floating-point operations than indicated, you must still
use the assessments provided. If you find a way to use fewer
operations and still get the job done for arbitrary input
parameters, please tell us!
-John Gustafson, Diane Rover, Michael Carter,
and Stephen Elbert
Ames Laboratory, Ames, Iowa 50011
******************************************************************************/
/*****************************************************************************/
/* The following program finds a value n such that a problem of size n */
/* takes just under "goal" seconds to execute. */
/* */
/* John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert */
/* Ames Laboratory, 3/18/90 */
/* */
/* Calls: Meter Measures execution time for some application. */
/* What Prints work-timing statistics and system information. */
/*****************************************************************************/
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
/* NMAX = Largest npatch for your computer; adjust as needed. */
/* It also sizes the static arrays in Meter, including coeff[NMAX][NMAX]. */
#define NMAX 2048
#define EPS (0.5e-8)
#define FALSE (1==0)
#define TRUE (!FALSE)
/* MAX evaluates each argument twice; do not pass expressions with side effects. */
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
/* Global variables and function return types: */
double goal, /* User input, fixed-time benchmark goal, in seconds. */
timing, /* Elapsed time returned by Meter routine, in seconds.*/
work, /* In this case, number of FLOPs performed. */
When(), /* Wall clock in seconds. */
Ddot(); /* Double dot product. */
int mean, /* Avg between upper and lower bounds for bisection */
/* method. */
n, /* The problem size. */
nupper, /* Upper bound on problem size, used in iterating */
/* toward goal. */
Meter(), /* Driver for following benchmark functions. */
Reader (), /* Reads problem description from 'geom' file. */
Region (), /* Subdivides box faces into patches. */
SetUp3 (), /* Set up matrix to solve. */
Storer (), /* Write result to 'answer' file. */
Verify (); /* Verify the radiosity solution from solver. */
void SetUp1 (), /* Set up matrix to solve. */
SetUp2 (), /* Set up matrix to solve. */
Solver (); /* Solve the radiosity matrix. */
/*
 * SLALOM driver.
 *
 * Reads the goal time and a lower and an upper bound for the problem size
 * from standard input, then bisects the [n, nupper] interval until it has
 * width 1, so that n is the largest size measured to finish in under
 * "goal" seconds.  The final run is repeated for n and its statistics are
 * printed by What().
 *
 * Entering a bound <= 0 ends the program with status 0.  Input that cannot
 * be parsed, or end of input, ends it with status 1; previously the scanf
 * results were ignored and such input made the prompt loops spin forever.
 * A timing is only interpreted when Meter() succeeded; previously a
 * rejected upper bound (for instance one above NMAX) was compared against
 * the stale timing of the preceding run and could be copied into n.
 */
int
main ()
{
    int ok; /* Return code temporary storage. */

    /* Get desired number of seconds: */
    printf ("Enter the number of seconds that is the goal: ");
    if (scanf ("%lg", &goal) != 1) {
        printf ("\nslalom: could not read the goal time.\n");
        exit (1);
    }

    /* Get lower and upper bounds for n from the standard input device: */
    do {
        printf ("Enter a lower bound for n: ");
        if (scanf ("%d", &n) != 1) {
            printf ("\nslalom: could not read a lower bound.\n");
            exit (1);
        }
        if (n <= 0)
            exit(0);
        ok = Meter (n, &timing, &work);
        if (ok && timing >= goal)
            printf ("Must take less than %g seconds. Took %g.\n",
                    goal, timing);
    } while (!ok || timing >= goal);

    do {
        printf ("Enter an upper bound for n: ");
        if (scanf ("%d", &nupper) != 1) {
            printf ("\nslalom: could not read an upper bound.\n");
            exit (1);
        }
        if (nupper <= 0)
            exit(0);
        ok = Meter (nupper, &timing, &work);
        if (ok && timing < goal) {
            printf ("Must take at least %g seconds. Took %g.\n",
                    goal, timing);
            /* nupper ran fast enough, so it is a better lower bound. */
            n = MAX(nupper, n);
        }
    } while (!ok || timing < goal);

    /*
     * While the [n, nupper] interval is larger than 1, bisect it and
     * pick a half:
     */
    while (nupper - n > 1) {
        mean = (n + nupper) / 2;
        ok = Meter (mean, &timing, &work);
        if (timing < goal)
            n = mean;
        else
            nupper = mean;
        printf ("New interval: [%d,%d]\n", n, nupper);
    }

    /* Ensure that most recent run was for n, not nupper. */
    ok = Meter (n, &timing, &work);

    /* Print out final statistics. */
    What (n, timing, work);
    return 0;
}
/*****************************************************************************/
/* This routine should be edited to contain information for your system. */
/*****************************************************************************/
/*
 * Prints the final report: the fixed system-description lines below,
 * followed by the operation count, elapsed time, problem size, MFLOPS rate
 * and an estimate of the data memory used for a problem of size n.
 */
What (n, timing, work)
int n;
double timing, work;
{
    static char *info[] = {
"Machine: SUN 4/370GX Processor: SPARC",
"Memory: 32 MB # of procs: 1",
"Cache: 128 KB # used: 1",
"NMAX: 512 Clock: 25 MHz",
"Disk: .3GB SCSI+.7GB SMD Node name: amssun2",
"OS: SUNOS 4.0.3 Timer: Wall, gettimeofday()",
"Language: C Alone: yes",
"Compiler: cc Run by: M. Carter",
"Options: -O Date: 23 May 1990",
NULL
    };
    char **line;                    /* walks info[] up to the NULL marker */
    double mops, mflops;            /* work and rate scaled to millions */
    int bytes;                      /* estimated data memory, in bytes */

    mops = work * 1e-6;
    mflops = (work / timing) * 1e-6;
    bytes = 8 * n * n + 120 * n + 800;

    printf ("\n");
    line = info;
    while (*line) {
        puts (*line);
        line++;
    }
    printf ("M ops: %-13lg Time: %-.3lf seconds\n", mops, timing);
    printf ("n: %-6d MFLOPS: %-.5lg\n", n, mflops);
    printf ("Approximate data memory use: %d bytes.\n", bytes);
}
/*****************************************************************************/
/* This routine measures time required on a revised LINPACK-type benchmark, */
/* including input, matrix generation, solution, and output. */
/* */
/* John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert */
/* Ames Laboratory, 3/18/90 */
/* */
/* Calls: Reader Reads the problem description from secondary storage. */
/* Region Partitions box surface into rectangular regions (patches).*/
/* SetUp1 Sets up equations from patch geometries-parallel faces. */
/* SetUp2 Sets up equations from patch geometries-orthogonal faces. */
/* SetUp3 Sets up equations-row normalization and radiant props. */
/* Solver Solves the equations by LDL factorization. */
/* Storer Stores solution (patch radiosities) on secondary storage. */
/* When Returns wall-clock time, in seconds. */
/*****************************************************************************/
/*
 * Runs one complete benchmark pass for npatch patches and reports on it.
 *
 * Returns FALSE without measuring when npatch is below 6 or above NMAX, or
 * as soon as Reader, Region or SetUp3 reports failure; in those cases
 * *timing and *work are left untouched.  Otherwise stores the elapsed
 * seconds in *timing and the assessed operation count in *work, prints a
 * per-task table, calls Verify, and returns TRUE.
 */
Meter (npatch, timing, work)
int npatch; /* In, problem size, here the number of equations. */
double *timing, /* Out, number of seconds measured. */
*work; /* Out, work done, here the number of FLOPs. */
{
static
double area[NMAX], /* Areas of patches * 8 * pi. */
box[7], /* Dimensions of box in x, y, z directions. */
coeff[NMAX][NMAX], /* The coefficients of the eqns to solve. */
diag[3][NMAX], /* Diag terms of the eqns to solve. (RGB) */
emiss[6][3], /* (RGB) emissivities of patches. */
place[3][NMAX], /* Width-height-depth position of patches. */
result[3][NMAX], /* Answer radiosities (RGB). */
rho[6][3], /* (RGB) Reflectivities of patches. */
rhs[3][NMAX], /* Right-hand sides of eqns to solve (RGB). */
size[2][NMAX]; /* Width-height sizes of patches. */
double ops[8], /* Floating-point operation counts. */
p[6], /* Number of patches in faces. */
sec[8], /* Times for routines, in seconds. */
tmp1, tmp2; /* Double temporary variables. */
int i, /* Loop counter. */
itmp1, /* Integer temporary variable. */
non0; /* Index of first nonzero off-diagonal elem. */
static
int loop[6][2]; /* Patch number ranges for faces. */
static char *tasks[] = { /* Names of all the functions in benchmark. */
"Reader", "Region",
"SetUp1", "SetUp2",
"SetUp3", "Solver",
"Storer"
};
static char *format = /* Output line format. */
"%6.6s%8.3f%17.0f%14.6f%10.1f %%\n";
/* First check that npatch lies between 6 and NMAX: */
if (npatch < 6) {
printf ("Must be at least 6, the number of faces.\n");
return (FALSE);
}
else if (npatch > NMAX) {
printf ("Exceeds %d = maximum for this system.\n", NMAX);
return (FALSE);
}
/* Ensure that previous 'answer' file is deleted: */
unlink ("answer");
/* Time the tasks, individually and collectively. */
/* sec[k] is the wall-clock reading taken just before task k starts. */
sec[0] = When();
if (!Reader (box, rho, emiss))
return (FALSE);
sec[1] = When();
if (!Region (npatch, loop, box, place, size, area))
return (FALSE);
sec[2] = When();
SetUp1 (npatch, loop, coeff, place, size);
sec[3] = When();
SetUp2 (npatch, loop, coeff, place, size);
sec[4] = When();
if (!SetUp3 (npatch, loop, area, rho, emiss, coeff, diag, rhs))
return (FALSE);
sec[5] = When();
non0 = loop[1][0];
Solver (npatch, non0, coeff, diag, rhs, result);
sec[6] = When();
Storer (npatch, loop, place, size, result);
sec[7] = When();
*timing = sec[7] - sec[0];
/* Turn the eight readings into seven per-task durations, in place. */
for (i = 0 ; i < 7 ; i++)
sec[i] = sec[i+1] - sec[i];
/* Assess floating-point work done by each routine called, and total: */
/* Note the ops counts are tallied into a double array, and there are */
/* some strange casts to double in some equations. This is to */
/* prevent integer overflow. */
itmp1 = 0;
tmp1 = 0.0;
for (i = 0 ; i < 6 ; i++) {
p[i] = loop[i][1] - loop[i][0] + 1;
tmp1 += p[i] * p[i];
/* NOTE(review): reads box[i + 1], i.e. box[6] on the last pass; */
/* confirm that Reader fills box[6]. */
itmp1 += sqrt(p[i] * box[i] / box[i + 1]) + 0.5;
}
tmp2 = p[0] * p[3] + p[1] * p[4] + p[2] * p[5];
/* ops[k] is the assessed operation count for tasks[k]. */
ops[0] = 258;
ops[1] = 154 + (double) 8 * itmp1 + npatch;
ops[2] = 6 + 532 * tmp2;
ops[3] = 8*npatch + 370 * ((double) npatch * npatch - tmp1 - 2*tmp2) / 2.0;
ops[4] = 72 + (double) 9 * npatch + (double) npatch * npatch - tmp1;
ops[5] = npatch * (npatch * ((double) npatch + 7.5) - 2.5) - 21
+ (non0+1) * ((non0+1) * (2 * ((double) non0+1) - 16.5) + 35.5)
+ (non0+1) * npatch * (9 - 3 * ((double) non0+1));
ops[6] = 48 * npatch;
*work = ops[0] + ops[1] + ops[2] + ops[3] + ops[4] + ops[5] + ops[6];
/* Display timing-work-speed breakdown by routine. */
printf ("%d patches:\n", npatch);
printf (" Task Seconds Operations MFLOPS %% of Time\n");
for (i = 0 ; i < 7 ; i++) {
/* A zero duration is replaced by 0.001 s so the rate below is finite. */
if (sec[i] == 0.0)
sec[i] = 0.001;
printf (format, tasks[i], sec[i], ops[i], (ops[i] / sec[i]) * 1e-6,
100.0 * sec[i] / *timing);
}
printf (format, "TOTALS", *timing, *work, (*work / *timing) * 1e-6, 100.0);
/* Verification runs after the timed section; its result is not used here. */
Verify (npatch, coeff, diag, rhs, result);
return (TRUE);
}
/*****************************************************************************/
/* This function should return the actual, wall clock time (not CPU time) */
/* in seconds as accurately as possible. Change it to your system timer. */
/*****************************************************************************/
/*
 * Returns the current wall-clock time in seconds, with the microsecond
 * part of gettimeofday() as the fraction.
 *
 * The timezone argument is passed as NULL: POSIX leaves the behaviour
 * unspecified when it is non-null, and the value was never used here.
 * The result of gettimeofday() is not checked.
 */
double
When()
{
    struct timeval tp;

    gettimeofday (&tp, NULL);
    return ((double) tp.tv_sec + (double) tp.tv_usec * 1e-6);
}
/*****************************************************************************/
/* The following routine reads in the problem description from secondary */
/* storage, and checks that numbers are in reasonable ranges. */
/*****************************************************************************/
/*
 * Read the problem description from the file "geom" and range-check it.
 *
 * File layout, as consumed below:
 *   - 3 box dimensions, then the rest of that line is discarded;
 *   - 18 reflectivities: six face values for each of the three colors,
 *     then the rest of that line is discarded;
 *   - 18 emissivities in the same order, then the rest of the line.
 *
 * box must have room for at least 7 elements: entries 0-2 are read from the
 * file, entries 3-5 repeat entries 0-2, and entry 6 repeats entry 3.
 *
 * Returns TRUE when every value is in range and FALSE otherwise.  If the
 * file cannot be opened, or fewer values than expected can be parsed, the
 * program is terminated with exit(1).
 */
int
Reader (box, rho, emiss)
double box[],      /* Out: Dimensions of box in x, y, z directions.      */
       rho[][3],   /* Out: (RGB) Reflectivities of patches.              */
       emiss[][3]; /* Out: (RGB) emissivities of patches.                */
{
    int i, j,          /* Loop counters.                                 */
        n;             /* Number of values successfully parsed.          */
    double tmp1;       /* Maximum emissivity seen for one color.         */
    FILE *infile;      /* Input file pointer.                            */
    char buff[81];     /* Buffer used to eat a line of input.            */

    /* Open the input file and read in the data. */
    if ((infile = fopen ("geom", "r")) == NULL) {
        printf ("slalom: 'geom' geometry file not found.\n");
        exit (1);
    }

    /*
     * Read the box coordinates and error check.  Only successful
     * conversions are counted, so an EOF return (-1) never skews the tally.
     */
    n = 0;
    for (i = 0 ; i < 3 ; i++) {
        if (fscanf (infile, "%lg", &box[i]) == 1)
            n++;
    }
    fgets (buff, 80, infile);          /* Eat the rest of the line. */
    if (n != 3) {
        printf ("Must specify exactly 3 box coordinates.\n");
        exit (1);
    }

    /* Read the reflectivities (color-major: six faces per color). */
    n = 0;
    for (j = 0 ; j < 3 ; j++) {
        for (i = 0 ; i < 6 ; i++) {
            if (fscanf (infile, "%lg", &rho[i][j]) == 1)
                n++;
        }
    }
    fgets (buff, 80, infile);          /* Eat the rest of the line. */
    if (n != 18) {
        printf ("Must specify exactly 18 reflectivities.\n");
        exit (1);
    }

    /* Read the emissivities, in the same order as the reflectivities. */
    n = 0;
    for (j = 0 ; j < 3 ; j++) {
        for (i = 0 ; i < 6 ; i++) {
            if (fscanf (infile, "%lg", &emiss[i][j]) == 1)
                n++;
        }
    }
    fgets (buff, 80, infile);          /* Eat the rest of the line. */
    if (n != 18) {
        printf ("Must specify exactly 18 emissivities.\n");
        exit (1);
    }
    fclose (infile);

    /* Now sanity-check the values that were just read. */
    for (j = 0 ; j < 3 ; j++) {
        if (box[j] < 1.0 || box[j] >= 100.0) {
            printf ("Box dimensions must be between 1 and 100.\n");
            return (FALSE);
        }
        /* Repeat the dimension so that box[k] and box[k + 3] agree. */
        box[j+3] = box[j];

        tmp1 = 0.0;
        for (i = 0 ; i < 6 ; i++) {
            if (rho[i][j] < 0.000 || rho[i][j] > 0.999) {
                printf ("Reflectivities must be between .000 and .999.\n");
                return (FALSE);
            }
            if (emiss[i][j] < 0.0) {
                printf ("Emissivity cannot be negative.\n");
                return (FALSE);
            }
            if (tmp1 < emiss[i][j])
                tmp1 = emiss[i][j];
        }
        /* At least one face must emit in every color. */
        if (tmp1 == 0.0) {
            printf ("Emissivities are zero. Problem is trivial.\n");
            return (FALSE);
        }
    }
    box[6] = box[3];
    return (TRUE);
}
/*****************************************************************************/
/* The following routine decomposes the surface of a variable-sized box */
/* into patches that are as nearly equal in size and square as possible. */
/*****************************************************************************/
Region (npatch, loop, box, place, size, area)
int npatch,            /* In: Problem size.                                 */
    loop[][2];         /* Out: Patch number ranges for faces.               */
double area[],         /* Out: 8pi * areas of the patches.                  */
       box[],          /* In: Dimensions of box in x, y, z directions.      */
       place[][NMAX],  /* Out: Width-height-depth positions of patches.     */
       size[][NMAX];   /* Out: Width-height sizes of patches.               */
{
    /*
     * Splits the box surface into npatch patches.  First each of the six
     * faces receives a contiguous range of patch numbers in proportion to
     * its area; then every face is cut into columns, and every column into
     * rows.  Returns TRUE on success, FALSE when some column would end up
     * with no rows.
     */
    int face, col, patch;  /* Loop counters.                                */
    int face_patches;      /* Number of patches assigned to a face.         */
    int ncols, nrows;      /* Columns on a face; rows within one column.    */
    int spread;            /* Running numerator that distributes rows.      */
    int offset;            /* Patches already used by earlier columns.      */
    int first, stop;       /* Half-open patch range of the current column.  */
    double surface;        /* Total surface area of the box.                */
    double cumulative;     /* Area of the faces handled so far.             */
    double depth;          /* Third coordinate shared by a face's patches.  */
    double col_width, row_height;
    double x, y;           /* Position of the current column / row.         */
    double scaled_area;    /* 8 * pi * area of a patch in this column.      */

    /* Allocate patches to each face, proportionate to area of each face. */
    surface = 2.0 * (box[0] * box[1] + box[1] * box[2] + box[2] * box[0]);
    cumulative = 0.0;
    loop[0][0] = 0;
    for (face = 0 ; face < 5 ; face++) {
        cumulative += box[face] * box[face + 1];
        loop[face][1] = (int) ((double) npatch * cumulative / surface + 0.5) - 1;
        loop[face + 1][0] = loop[face][1] + 1;
    }
    loop[5][1] = npatch - 1;

    /* Subdivide each face into its share of patches. */
    for (face = 0 ; face < 6 ; face++) {
        face_patches = loop[face][1] - loop[face][0] + 1;
        depth = (face >= 3) ? box[face - 1] : 0.0;

        /* Pick a column count, clamped to the range 1..face_patches. */
        ncols = (int) (sqrt(face_patches * box[face] / box[face + 1]) + 0.5);
        if (ncols > face_patches)
            ncols = face_patches;
        if (ncols == 0)
            ncols = 1;
        col_width = box[face] / ncols;

        x = 0.0;
        for (col = 0, spread = ncols - 1 ;
             col < ncols ;
             col++, spread += face_patches) {
            offset = spread / ncols;
            nrows = (spread + face_patches) / ncols - offset;
            if (nrows == 0) {
                printf ("Eccentric box requires more patches.\n");
                return (FALSE);
            }
            row_height = box[face + 1] / nrows;
            scaled_area = col_width * row_height * (8.0 * M_PI);
            first = loop[face][0] + offset;
            stop = first + nrows;

            /* Lay the rows of this column out from y = 0 upward. */
            y = 0.0;
            for (patch = first ; patch < stop ; patch++) {
                size[0][patch] = col_width;
                size[1][patch] = row_height;
                place[0][patch] = x;
                place[1][patch] = y;
                place[2][patch] = depth;
                area[patch] = scaled_area;
                y += row_height;
            }
            x += col_width;
        }
    }
    return (TRUE);
}
/*****************************************************************************/
/* This routine sets up the radiosity matrix for parallel patches. */
/*****************************************************************************/
/*
 * Fill in the coefficients that couple each patch on faces 0, 1 and 2 with
 * each patch on the face three positions later (faces 3, 4 and 5).
 * Every value is stored symmetrically, in coeff[ipatch][jpatch] and in
 * coeff[jpatch][ipatch].  The parameter npatch is not referenced here.
 */
void
SetUp1 (npatch, loop, coeff, place, size)
int npatch, /* In: Problem size. */
loop[][2]; /* In: Patch number ranges for faces. */
double coeff[][NMAX], /* Out: The coefficients of the eqns to solve. */
place[][NMAX], /* In: Width-height-depth positions of patches. */
size[][NMAX]; /* In: Width-height sizes of patches. */
{
int i, j, k, /* General loop counters. */
m, n, /* General loop counters. */
iface, /* Loop counter over the number of faces. */
ipatch, /* Loop counter over the number of patches. */
jface, /* Face coupled to iface when computing mat. elems. */
jpatch; /* Patch coupled to ipatch when computing mat. elems.*/
double d[2][2][2], /* Point-to-point couplings between patch corners. */
d2[2][2][2],/* Squares of d values, to save recomputation. */
tmp1, tmp2, /* Double temporary variables. */
tmp3, tmp4, /* Double temporary variables. */
tmp5, tmp6, /* Double temporary variables. */
tmp7, tmp8; /* Double temporary variables. */
for (iface = 0 ; iface < 3 ; iface++) {
jface = iface + 3;
/* tmp1 is the square of the third (depth) coordinate of the first patch */
/* on jface; tmp6 is twice that value. */
tmp1 = place[2][loop[jface][0]] * place[2][loop[jface][0]];
tmp6 = tmp1 + tmp1;
for (ipatch = loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
for (jpatch=loop[jface][0] ; jpatch <= loop[jface][1] ; jpatch++) {
/* For each in-plane axis j, form the four offsets between the edges of */
/* the two patches, and their squares. */
for (j = 0 ; j < 2 ; j++) {
d [0][0][j] = place[j][jpatch] - place[j][ipatch];
d [1][0][j] = d[0][0][j] + size[j][jpatch];
d [0][1][j] = d[0][0][j] - size[j][ipatch];
d [1][1][j] = d[1][0][j] - size[j][ipatch];
d2[0][0][j] = d[0][0][j] * d[0][0][j];
d2[1][0][j] = d[1][0][j] * d[1][0][j];
d2[0][1][j] = d[0][1][j] * d[0][1][j];
d2[1][1][j] = d[1][1][j] * d[1][1][j];
}
/* First pass: tmp2 accumulates terms built from the axis-1 squares. */
/* The "x = term - x" updates followed by "x = -x" after each inner loop */
/* give the accumulated terms alternating signs. */
tmp2 = 0.0;
for (m = 0 ; m < 2 ; m++) {
for (i = 0 ; i < 2 ; i++) {
tmp3 = d2[m][i][1] + tmp1;
tmp4 = sqrt(tmp3);
tmp5 = 1.0 / tmp4;
tmp8 = 0.0;
for (k = 0 ; k < 2 ; k++) {
for (n = 0 ; n < 2 ; n++) {
tmp7 = d[k][n][0];
tmp8 = -tmp7 * atan(tmp7 * tmp5) - tmp8;
}
tmp8 = -tmp8;
}
tmp2 = -4.0 * tmp4 * tmp8 - tmp2 - tmp6 *
log(((d2[1][0][0] + tmp3) * (d2[0][1][0] + tmp3)) /
((d2[0][0][0] + tmp3) * (d2[1][1][0] + tmp3)));
}
tmp2 = -tmp2;
}
/* Second pass: the same pattern with the two axes exchanged and */
/* without the logarithm term; tmp2 keeps accumulating. */
for (m = 0 ; m < 2 ; m++) {
for (i = 0 ; i < 2 ; i++) {
tmp4 = sqrt(d2[m][i][0] + tmp1);
tmp5 = 1.0 / tmp4;
tmp8 = 0.0;
for (k = 0 ; k < 2 ; k++) {
for (n = 0 ; n < 2 ; n++) {
tmp7 = d[k][n][1];
tmp8 = -tmp7 * atan(tmp7 * tmp5) - tmp8;
}
tmp8 = -tmp8;
}
tmp2 = -4.0 * tmp4 * tmp8 - tmp2;
}
tmp2 = -tmp2;
}
/* Store the coupling symmetrically. */
coeff[ipatch][jpatch] = tmp2;
coeff[jpatch][ipatch] = tmp2;
}
}
}
}
/*****************************************************************************/
/* This routine sets up the radiosity matrix for orthogonal patches. */
/*****************************************************************************/
/*
 * Fill in the coefficients that couple each patch on a face with each patch
 * on the next two faces in cyclic order (jface = (iface + m + 1) % 6 for
 * m = 0, 1).  Every value is stored symmetrically, in coeff[ipatch][jpatch]
 * and in coeff[jpatch][ipatch].  The parameter npatch is not referenced here.
 *
 * Naming of the locals below: a trailing 'd' marks a plain difference or a
 * square root, 's' a squared quantity, 'i' a reciprocal; the e.... values
 * are sums of one c..s term and one d..s term.
 */
void
SetUp2 (npatch, loop, coeff, place, size)
int npatch, /* In: Problem size. */
loop[][2]; /* In: Patch number ranges for faces. */
double coeff[][NMAX], /* Out: The coefficients of the eqns to solve. */
place[][NMAX], /* In: Width-height-depth positions of patches. */
size[][NMAX]; /* In: Width-height sizes of patches. */
{
int m, /* General loop counters. */
iface, /* Loop counter over the number of faces. */
ipatch, /* Loop counter over the number of patches. */
jface, /* Face coupled to iface when computing mat. elems. */
jpatch; /* Patch coupled to ipatch when computing mat. elems.*/
double tmpb, tmpa,
c11d, c12d, c21d, c22d, c11s, c12s, c21s, c22s,
d11d, d12d, d21d, d22d, d11s, d12s, d21s, d22s,
d11i, d12i, d21i, d22i, a10s, a20s, b01s, b02s,
e1111, e1211, e2111, e2211, e1112, e1212, e2112, e2212,
e1121, e1221, e2121, e2221, e1122, e1222, e2122, e2222;
for (iface = 0 ; iface < 6 ; iface++) {
for (m = 0 ; m < 2 ; m++) {
jface = (iface + m + 1) % 6;
for (ipatch=loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
/* Squared distances from the two edges of ipatch (along axis m) to the */
/* depth coordinate of jface; these depend on ipatch only. */
a10s = place[m][ipatch] - place[2][loop[jface][0]];
a20s = a10s + size[m][ipatch];
a10s = a10s * a10s;
a20s = a20s * a20s;
for (jpatch=loop[jface][0] ; jpatch<=loop[jface][1];jpatch++) {
/* Four edge-to-edge offsets between the patches, and their squares. */
c11d = place[m][jpatch] - place[1-m][ipatch];
c12d = c11d + size[m][jpatch];
c21d = c11d - size[1-m][ipatch];
c22d = c12d - size[1-m][ipatch];
c11s = c11d * c11d;
c12s = c12d * c12d;
c21s = c21d * c21d;
c22s = c22d * c22d;
b01s = place[1 - m][jpatch] - place[2][ipatch];
b02s = b01s + size[1 - m][jpatch];
/* Bump the term by a small real to avoid */
/* singularities in coupling function: */
b01s = b01s * b01s + 1e-35;
b02s = b02s * b02s + 1e-35;
/* Sums of squares, their square roots, and the reciprocals of the roots. */
d11s = a10s + b01s;
d12s = a10s + b02s;
d21s = a20s + b01s;
d22s = a20s + b02s;
d11d = sqrt(d11s);
d12d = sqrt(d12s);
d21d = sqrt(d21s);
d22d = sqrt(d22s);
d11i = 1.0 / d11d;
d12i = 1.0 / d12d;
d21i = 1.0 / d21d;
d22i = 1.0 / d22d;
/* Arctangent part of the coupling: 16 signed terms. */
tmpa = d11d * ( c11d * atan (c11d * d11i)
- c12d * atan (c12d * d11i)
- c21d * atan (c21d * d11i)
+ c22d * atan (c22d * d11i))
+ d12d * (-c11d * atan (c11d * d12i)
+ c12d * atan (c12d * d12i)
+ c21d * atan (c21d * d12i)
- c22d * atan (c22d * d12i))
+ d21d * (-c11d * atan (c11d * d21i)
+ c12d * atan (c12d * d21i)
+ c21d * atan (c21d * d21i)
- c22d * atan (c22d * d21i))
+ d22d * ( c11d * atan (c11d * d22i)
- c12d * atan (c12d * d22i)
- c21d * atan (c21d * d22i)
+ c22d * atan (c22d * d22i));
/* All sixteen c..s + d..s combinations, used by the logarithm part. */
e1111 = c11s + d11s;
e1211 = c12s + d11s;
e2111 = c21s + d11s;
e2211 = c22s + d11s;
e1112 = c11s + d12s;
e1212 = c12s + d12s;
e2112 = c21s + d12s;
e2212 = c22s + d12s;
e1121 = c11s + d21s;
e1221 = c12s + d21s;
e2121 = c21s + d21s;
e2221 = c22s + d21s;
e1122 = c11s + d22s;
e1222 = c12s + d22s;
e2122 = c21s + d22s;
e2222 = c22s + d22s;
/* Logarithm part of the coupling: 8 signed terms. */
tmpb = c11s * log( e1111 * e1122 / (e1112 * e1121))
- c12s * log( e1211 * e1222 / (e1212 * e1221))
- c21s * log( e2111 * e2122 / (e2112 * e2121))
+ c22s * log( e2211 * e2222 / (e2212 * e2221))
- d11s * log( e1111 * e2211 / (e1211 * e2111))
+ d12s * log( e1112 * e2212 / (e1212 * e2112))
+ d21s * log( e1121 * e2221 / (e1221 * e2121))
- d22s * log( e1122 * e2222 / (e1222 * e2122));
/* Store the magnitude of the combined value symmetrically. */
coeff[ipatch][jpatch] = fabs(4.0 * tmpa + tmpb);
coeff[jpatch][ipatch] = coeff[ipatch][jpatch];
}
}
}
}
}
/*****************************************************************************/
/* This routine sets up the radiosity matrix... normalizes row sums to 1, */
/* and includes terms derived from reflectivities and emissivities of faces. */
/*****************************************************************************/
SetUp3 (npatch, loop, area, rho, emiss, coeff, diag, rhs)
int npatch,            /* In: Problem size.                                 */
    loop[][2];         /* In: Patch number ranges for faces.                */
double area[],         /* In: 8 * pi * areas of the patches.                */
       rho[][3],       /* In: (RGB) Reflectivities of the face interiors.   */
       emiss[][3],     /* In: (RGB) Emissivities of the face interiors.     */
       coeff[][NMAX],  /* Out: The coefficients of the eqns to solve.       */
       diag[][NMAX],   /* Out: (RGB) Diagonal terms of the system.          */
       rhs[][NMAX];    /* Out: (RGB) Right-hand sides of system to solve.   */
{
    /*
     * For every patch: sum its couplings to all patches on other faces,
     * check that sum against the patch's entry in area[], clear the
     * couplings to patches on its own face, and derive the per-color
     * diagonal and right-hand-side entries from the negated sum.
     * Returns FALSE as soon as one row sum fails the check, TRUE otherwise.
     */
    int color, face, row, col;
    int first, last;        /* Patch range of the current face.             */
    double row_sum;         /* Sum of couplings to patches on other faces.  */
    double inv_rho[3];      /* 1 / reflectivity, per color.                 */
    double emiss_by_rho[3]; /* emissivity / reflectivity, per color.        */

    for (face = 0 ; face < 6 ; face++) {
        first = loop[face][0];
        last = loop[face][1];

        for (color = 0 ; color < 3 ; color++) {
            inv_rho[color] = 1.0 / rho[face][color];
            emiss_by_rho[color] = emiss[face][color] * inv_rho[color];
        }

        for (row = first ; row <= last ; row++) {
            /* Add the entries before the face's range, then those after. */
            row_sum = 0.0;
            for (col = 0 ; col < first ; col++)
                row_sum += coeff[row][col];
            for (col = last + 1 ; col < npatch ; col++)
                row_sum += coeff[row][col];

            /* Make sure row sum (total form factor) is close to 1: */
            if (fabs(row_sum - area[row]) > (0.5e-9 * row_sum)) {
                printf ("Total form factor is too far from unity.\n");
                return (FALSE);
            }
            row_sum = -row_sum;

            /* Set coplanar patch interactions to zero. */
            for (col = first ; col <= last ; col++)
                coeff[row][col] = 0.0;

            /* Assign diagonal entries and right-hand sides. */
            for (color = 0 ; color < 3 ; color++) {
                diag[color][row] = inv_rho[color] * row_sum;
                rhs[color][row] = emiss_by_rho[color] * row_sum;
            }
        }
    }
    return (TRUE);
}
/*****************************************************************************/
/* This routine factors and backsolves a real, symmetric, near-dense matrix */
/* by LDL factorization. No pivoting; the matrix is diagonally dominant. */
/*****************************************************************************/
/*
 * Solve the system once for each of the three colors (m = 0, 1, 2).
 * For every color the lower triangle of coeff is rebuilt from the upper
 * triangle and from diag[m], factored in place, and then used to turn
 * rhs[m] into result[m].  coeff is overwritten by the factors.
 * Rows below non0 are treated as having no off-diagonal entries to
 * eliminate: only the reciprocal of their diagonal is stored.
 */
void
Solver (npatch, non0, coeff, diag, rhs, result)
int npatch, /* In: Problem size. */
non0; /* In: Index of first nonzero off-diagonal mat. elem.*/
double coeff[][NMAX], /* In/Out: The coefficients of the eqns to solve. */
diag[][NMAX], /* Out: (RGB) Diagonal terms of the system. */
rhs[][NMAX], /* In: (RGB) Right-hand sides of system to solve. */
result[][NMAX]; /* Out: (RGB) solution radiosities. */
{
int i, j, /* General loop counters. */
k, m; /* General loop counters. */
double tmp1; /* Double temporary variable. */
/* Load lower triangle of coefficients, diagonal, and solution vector. */
for (m = 0 ; m < 3 ; m++) {
for (i = non0 ; i < npatch ; i++) {
coeff[i][i] = diag[m][i];
result[m][i] = rhs[m][i];
/* Mirror the upper-triangle entries of column i into row i. */
for (j = 0 ; j < i ; j++) {
coeff[i][j] = coeff[j][i];
}
}
/* Factor matrix, writing factors on top of original matrix. */
/* Rows before non0 only need the reciprocal of their diagonal entry. */
for (j = 0 ; j < non0 ; j++) {
coeff[j][j] = 1.0 / diag[m][j];
result[m][j] = rhs[m][j];
}
for (j = non0 ; j < npatch ; j++) {
/* Subtract the contribution of the columns already processed. */
for (k = non0 ; k < j ; k++) {
coeff[j][k] -= Ddot (k, &coeff[k][0], 1, &coeff[j][0], 1);
}
/* Scale row j by the stored reciprocal diagonals and update its own */
/* diagonal entry. */
for (k = 0 ; k < j ; k++) {
tmp1 = coeff[j][k];
coeff[j][k] = tmp1 * coeff[k][k];
coeff[j][j] -= tmp1 * coeff[j][k];
}
/* Keep the reciprocal, so later steps multiply instead of divide. */
coeff[j][j] = 1.0 / coeff[j][j];
}
/* Backsolve, in three stages (for L, D, and L transpose). */
for (k = non0 ; k < npatch ; k++) {
result[m][k] -= Ddot (k, &result[m][0], 1, &coeff[k][0], 1);
}
for (k = 0 ; k < npatch ; k++) {
result[m][k] *= coeff[k][k];
}
/* A stride of NMAX steps down a column of coeff, one row per element. */
for (k = npatch - 2 ; k >= non0 ; k--) {
result[m][k] -= Ddot (npatch-(k+1), &result[m][k+1], 1,
&coeff[k+1][k], NMAX);
}
for (k = non0 - 1 ; k >= 0 ; k--) {
result[m][k] -= Ddot (npatch-non0, &result[m][non0], 1,
&coeff[non0][k], NMAX);
}
}
}
/*****************************************************************************/
/* The following routine writes the answer to secondary storage. */
/*****************************************************************************/
Storer (npatch, loop, place, size, result)
int npatch,             /* In: Problem size.                                */
    loop[][2];          /* In: Patch number ranges for faces.               */
double result[][NMAX],  /* In: (RGB) Radiosity solutions.                   */
       place[][NMAX],   /* In: Width-height-depth positions of patches.     */
       size[][NMAX];    /* In: Width-height sizes of patches.               */
{
    /*
     * Writes two tables to the file "answer": the geometry of every patch,
     * then the three radiosity values of every patch.  Patch and face
     * numbers are printed one-based.  Terminates the program with exit(1)
     * if the file cannot be opened for writing.
     */
    int face, patch;    /* Loop counters over faces and their patches.      */
    int first, last;    /* Patch range of the current face.                 */
    FILE *fp;           /* Output file pointer.                             */

    fp = fopen("answer", "w");
    if (fp == NULL) {
        printf ("Unable to open 'answer' file.\n");
        exit (1);
    }

    /* Geometry table. */
    fprintf (fp, "%d patches:\n", npatch);
    fprintf (fp,
             " Patch Face Position in w, h, d Width Height\n");
    for (face = 0 ; face < 6 ; face++) {
        first = loop[face][0];
        last = loop[face][1];
        for (patch = first ; patch <= last ; patch++) {
            fprintf (fp,
                     "%5d %4d%11.5lf%11.5lf%11.5lf %11.5lf%11.5lf\n",
                     patch + 1, face + 1,
                     place[0][patch], place[1][patch], place[2][patch],
                     size[0][patch], size[1][patch]);
        }
    }

    /* Radiosity table. */
    fprintf (fp, "\n Patch Face Radiosities\n");
    for (face = 0 ; face < 6 ; face++) {
        first = loop[face][0];
        last = loop[face][1];
        for (patch = first ; patch <= last ; patch++) {
            fprintf (fp, "%5d %4d%12.8lf%12.8lf%12.8lf\n",
                     patch + 1, face + 1,
                     result[0][patch], result[1][patch], result[2][patch]);
        }
    }

    fclose(fp);
}
/*****************************************************************************/
/* This routine verifies that the computed radiosities satisfy the equations.*/
/* */
/* John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert */
/* Ames Laboratory, 3/18/90 */
/*****************************************************************************/
Verify (npatch, coeff, diag, rhs, result)
int npatch;             /* In: Problem size.                                */
double coeff[][NMAX],   /* In: The coefficients of the eqns to solve.       */
       diag[][NMAX],    /* In: (RGB) Diagonal terms of the system.          */
       rhs[][NMAX],     /* In: (RGB) Right-hand sides of system to solve.   */
       result[][NMAX];  /* In: (RGB) Radiosity solutions.                   */
{
    /*
     * Rebuilds the full matrix for each color, multiplies it by the
     * computed solution and accumulates the absolute residuals over all
     * three colors.  The total is divided by the product of the largest
     * matrix entry and the largest solution entry seen for the last color,
     * and compared with 3 * EPS.  Returns TRUE when the scaled residual is
     * not above that bound, FALSE (after printing it) otherwise.
     * Note that coeff is overwritten while the matrix is rebuilt.
     */
    int color, row, col;     /* Loop counters.                              */
    double resid_total;      /* Sum of |residual| over all rows and colors. */
    double row_resid;        /* Residual of the current row.                */
    double anorm, xnorm;     /* Largest |matrix| and |solution| entries.    */

    resid_total = 0.0;
    for (color = 0 ; color < 3 ; color++) {
        /* Put this color's diagonal in place and mirror the upper */
        /* triangle into the lower triangle.                        */
        for (row = 0 ; row < npatch ; row++) {
            coeff[row][row] = diag[color][row];
            for (col = 0 ; col < row ; col++)
                coeff[row][col] = coeff[col][row];
        }

        /* Multiply matrix by solution vector, and accumulate the residual. */
        xnorm = 0.0;
        anorm = 0.0;
        for (row = 0 ; row < npatch ; row++) {
            row_resid = rhs[color][row];
            for (col = 0 ; col < npatch ; col++) {
                row_resid -= (coeff[row][col] * result[color][col]);
                anorm = MAX(anorm, fabs(coeff[row][col]));
            }
            xnorm = MAX(xnorm, fabs(result[color][row]));
            resid_total += fabs(row_resid);
        }
    }

    resid_total /= (anorm * xnorm);
    if (resid_total > 3 * EPS) {
        printf ("Residual is too large: %lg\n", resid_total);
        return (FALSE);
    }
    return (TRUE);
}
#ifdef SUN4
/*****************************************************************************/
/* Double precision dot product specifically written for Sun 4/370. */
/* By Michael Carter and John Gustafson, May 30, 1990 */
/* This code unrolls the dot product four ways since that's how many */
/* registers are available on the SPARC. Other RISC systems will require */
/* something very similar. Also, unit stride is taken advantage of in the */
/* form of special cases. */
/*****************************************************************************/
/*
 * Return the dot product of two strided vectors of n doubles:
 * the sum over i of a[i * ia] * b[i * ib].
 *
 * The first n % 4 products are accumulated one at a time; the remaining
 * elements are processed four per iteration into four separate partial
 * sums, with dedicated loops for the unit-stride cases.  Because of the
 * four partial sums, the rounding may differ from a strictly sequential
 * summation.
 */
double
Ddot (n, a, ia, b, ib)
register
int n,      /* Number of elements in vectors.   */
    ia,     /* Stride of a vector in ELEMENTS.  */
    ib;     /* Stride of b vector in ELEMENTS.  */
register
double *a,  /* Pointer to first vector.         */
       *b;  /* Pointer to second vector.        */
{
    register double sum0 = 0.0,
                    sum1 = 0.0,
                    sum2 = 0.0,
                    sum3 = 0.0;
    register int m = n & 3;
    int t;          /* Scratch for exchanging the strides.  */
    double *tp;     /* Scratch for exchanging the pointers. */

    /* The ragged cleanup part. */
    while (m--) {
        sum0 += *a * *b;
        a += ia;
        b += ib;
    }

    /* The fast pipelined part */
    n >>= 2;

    /*
     * If only b has unit stride, exchange the roles of the two vectors so
     * that the "a has unit stride" loop below applies.  Both the strides
     * and the pointers must be exchanged; the pointers are swapped through
     * a pointer temporary, since an int cannot hold a pointer in general.
     */
    if (ib == 1 && ia != 1) {
        t = ia;
        ia = ib;
        ib = t;
        tp = a;
        a = b;
        b = tp;
    }

    /* We can optimize if one or more strides are equal to 1. */
    if (ia == 1) {
        /* This runs if both strides are 1. */
        if (ib == 1) {
            ia <<= 2;
            ib <<= 2;
            while (n--) {
                sum0 += a[0] * b[0];
                sum1 += a[1] * b[1];
                sum2 += a[2] * b[2];
                sum3 += a[3] * b[3];
                a += ia;
                b += ib;
            }
        }
        /* This runs if stride of a only is equal to 1. */
        else {
            ia <<= 2;
            while (n--) {
                sum0 += a[0] * *b;
                b += ib;
                sum1 += a[1] * *b;
                b += ib;
                sum2 += a[2] * *b;
                b += ib;
                sum3 += a[3] * *b;
                a += ia;
                b += ib;
            }
        }
    }
    /* This runs for the more general case. */
    /* This is about .5 MFLOPS slower on Sun 4/370 */
    else {
        while (n--) {
            sum0 += *a * *b;
            a += ia;
            b += ib;
            sum1 += *a * *b;
            a += ia;
            b += ib;
            sum2 += *a * *b;
            a += ia;
            b += ib;
            sum3 += *a * *b;
            a += ia;
            b += ib;
        }
    }
    return (sum0 + sum1 + sum2 + sum3);
}
#else
/*****************************************************************************/
/* Generic double-precision dot product. Unrolling will help pipelined */
/* computers. Modify accordingly. */
/*****************************************************************************/
double
Ddot (n, a, ia, b, ib)
register
int n,      /* Number of elements in vectors.   */
    ia,     /* Stride of a vector in ELEMENTS.  */
    ib;     /* Stride of b vector in ELEMENTS.  */
register
double *a,  /* Pointer to first vector.         */
       *b;  /* Pointer to second vector.        */
{
    /*
     * Plain strided dot product: adds a[i * ia] * b[i * ib] for the n
     * element pairs, in order, and returns the total (0.0 when n is 0).
     */
    register double total = 0.0;

    for ( ; n != 0 ; n--) {
        total += (*a) * (*b);
        a += ia;
        b += ib;
    }
    return (total);
}
#endif
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
/*
timer program -- computes total time in seconds
since the first call. Uses constant CLOCK_RATE
to compute the CPU time in seconds
*/
/*
 * Number of clock() ticks per second.  CLOCKS_PER_SEC from <time.h> is the
 * ISO C constant for this, so no per-platform value has to be hard-coded
 * (older MS-DOS Turbo C spelled the same constant CLK_TCK).
 */
#define CLOCK_RATE ((double) CLOCKS_PER_SEC)

float second(void);

/*
 * Return the processor time reported by clock(), converted to seconds.
 * The division is done in double precision before narrowing to float.
 */
float second(void)
{
    return (float) ((double) clock() / CLOCK_RATE);
}
/* C-style global parameters */
/* T, T1 and T2 are constants assigned once in main() and read by PA(), */
/* P3() and the loops in main(); E1 is the four-element work array that */
/* PA() and P0() operate on. */
float T,T1,T2,E1[4];
/* J, K and L are integer work variables; P0() uses them as one-based */
/* indices into E1. */
int J,K,L;
/*
 * Print one result line: three integer columns, four floating-point
 * columns in exponent notation, and the current value of second().
 * The line starts with a newline rather than ending with one.
 */
void POUT(long n, long j, long k, float x1, float x2, float x3, float x4)
{
    const float elapsed = second();

    printf("\n %7.1ld%7.1ld%7.1ld%12.4e%12.4e%12.4e%12.4e%8.2f",
           n, j, k, x1, x2, x3, x4, elapsed);
}
/*
 * Apply the four-element update six times in a row to E[0..3].
 * Each element is recomputed from the current values of all four, so the
 * statements must stay in this order.  The first three results are scaled
 * by the global T, the fourth is divided by the global T2.
 */
void PA(E)
float *E;
{
    int pass;

    for (pass = 0; pass < 6; pass++) {
        E[0] = (E[0] + E[1] + E[2] - E[3]) * T;
        E[1] = (E[0] + E[1] - E[2] + E[3]) * T;
        E[2] = (E[0] - E[1] + E[2] + E[3]) * T;
        E[3] = (-E[0] + E[1] + E[2] + E[3]) / T2;
    }
}
/*
 * Copy three elements of the global array E1 in a chain, using the globals
 * J, K and L as one-based indices: E1[J] <- E1[K], E1[K] <- E1[L],
 * E1[L] <- E1[J] (the value just stored there).
 */
void P0()
{
    const int j = J - 1;
    const int k = K - 1;
    const int l = L - 1;

    E1[j] = E1[k];
    E1[k] = E1[l];
    E1[l] = E1[j];
}
/*
 * Read *X and *Y, compute a = T * (*X + *Y) and b = T * (a + *Y), and
 * store (a + b) / T2 through Z.  *X and *Y themselves are not modified.
 * T and T2 are the global constants.
 */
void P3(X, Y, Z)
float *X, *Y, *Z;
{
    float a = *X;
    float b = *Y;

    a = T * (a + b);
    b = T * (a + b);
    *Z = (a + b) / T2;
}
/* equivalent description of FORTRAN-style common block ( slow !) */
/*
struct _comm_blk_ {
float _T, _T1, _T2, _E1[4];
int _J,_K,_L;
} common;
#define T common._T
#define T1 common._T1
#define T2 common._T2
#define E1 common._E1
#define J common._J
#define K common._K
#define L common._L
*/
/*
 * Benchmark driver.  Runs a fixed sequence of timed loops, each followed by
 * a POUT() line showing the loop count, some of the working values and the
 * elapsed time from second(), and finally prints 1.0e+08 divided by the
 * total time as "whetstones per second".
 */
int main()
{
float X1,X2,X3,X4,X,Y,Z;
long I,ISAVE,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12;
printf("Start timing.");
/* I scales every loop count below; it is later reused as the loop counter. */
I = 10;
T1=0.50025000;
T=0.499975000;
T2=2.0000;
ISAVE=I;
/* Loop counts.  N1, N5, N10 and N12 are zero; N5, N12 and ISAVE are */
/* assigned here but not used again in this function. */
N1=0;
N2=12*I;
N3=14*I;
N4=348*I;
N5=0;
N6=210*I;
N7=32*I;
N8=899*I;
N9=516*I;
N10=0;
N11=93*I;
N12=0;
/* Loop over N1 (zero iterations): arithmetic on four scalar floats. */
X1=1.0;
X2=-1.0;
X3=-1.0;
X4=-1.;
for(I=0; I<N1; I++)
{
X1=(X1+X2+X3-X4)*T;
X2=(X1+X2-X3+X4)*T;
X4=(-X1+X2+X3+X4)*T;
X3=(X1-X2+X3+X4)*T;
}
POUT(N1,N1,N1,X1,X2,X3,X4);
/* Loop over N2: the same kind of arithmetic on the elements of E1. */
E1[0]=1.0;
E1[1]=-1.0;
E1[2]=-1.0;
E1[3]=-1.0;
for(I=0; I<N2; I++)
{
E1[0]=(E1[0]+E1[1]+E1[2]-E1[3])*T;
E1[1]=(E1[0]+E1[1]-E1[2]+E1[3])*T;
E1[2]=(E1[0]-E1[1]+E1[2]+E1[3])*T;
E1[3]=(-E1[0]+E1[1]+E1[2]+E1[3])*T;
}
POUT(N2,N3,N2,E1[0],E1[1],E1[2],E1[3]);
/* Loop over N3: E1 updated through the function PA(). */
for(I=0; I<N3; I++) PA(E1);
POUT(N3,N2,N2,E1[0],E1[1],E1[2],E1[3]);
/* Loop over N4: chains of integer comparisons and assignments on J. */
J=1;
for(I=0; I<N4; I++)
{
if(J==1) J=2;
else J=3;
if(J<2) J=0;
else J=1;
if(J<1) J=1;
else J=0;
}
POUT(N4,J,J,X1,X2,X3,X4);
/* Loop over N6: integer arithmetic.  Starting from J=1, K=2, L=3 each */
/* pass recomputes the same three values, so the E1 indices L-2 and K-2 */
/* stay at 1 and 0. */
J=1;
K=2;
L=3;
for(I=0; I<N6; I++)
{
J=J*(K-J)*(L-K);
K=L*K-(L-J)*K;
L=(L-K)*(K+J);
E1[L-2]=J+K+L;
E1[K-2]=J*K*L;
}
POUT(N6,(long)J,(long)K,E1[0],E1[1],E1[2],E1[3]);
/* Loop over N7: calls to atan, sin and cos on local register copies of */
/* X, Y, T2 and T. */
X=0.5;
Y=0.5;
{
register float x=X;
register float y=Y;
register float t2=T2;
register float t=T;
for(I=0; I<N7; I++)
{
x=t*atan(t2*sin(x)*cos(x)/(cos(x+y)+cos(x-y)-1.0));
y=t*atan(t2*sin(y)*cos(y)/(cos(x+y)+cos(x-y)-1.0));
}
X=x; Y=y;
}
POUT(N7,(long)J,(long)K,X,X,Y,Y);
/* Loop over N8: calls to P3() with pointer arguments. */
X=1.0;
Y=1.0;
Z=1.0;
for(I=0; I<N8; I++) P3(&X,&Y,&Z);
POUT(N8,(long)J,(long)K,X,Y,Z,Z);
/* Loop over N9: calls to P0(), which shuffles E1 using J, K and L. */
J=1;
K=2;
L=3;
E1[0]=1.0;
E1[1]=2.0;
E1[2]=3.0;
for(I=0; I<N9; I++) P0();
POUT(N9,(long)J,(long)K,E1[0],E1[1],E1[2],E1[3]);
/* Loop over N10 (zero iterations): integer additions and subtractions. */
J=2;
K=3;
for(I=0; I<N10; I++)
{
J+=K;
K+=J;
J-=K;
K-=J+J;
}
POUT(N10,(long)J,(long)K,X1,X2,X3,X4);
/* Loop over N11: calls to sqrt, exp and log on a local register copy of X. */
X=0.75;
{
register float x=X;
register float t1=T1;
for(I=0; I<N11; I++) x=sqrt(exp(log(x)/t1));
X=x;
}
POUT(N11,(long)J,(long)K,X,X,X,X);
/* Final figure: 1.0e+08 divided by the total time reported by second(). */
printf("\n %g whetstones per second\n", 1.0e+08/second());
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment