slalom.c 43 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138
  1. /******************************************************************************
  2. S L A L O M
  3. Scalable Language-independent Ames Laboratory One-minute Measurement
  4. The following program is the first benchmark based on fixed time rather
  5. than fixed problem comparison. Not only is fixed time more representative
  6. of the way people use computers, it also greatly increases the scope and
  7. longevity of the benchmark. SLALOM is very scalable, and can be used to
  8. compare computers as slow as 126 floating-point operations per second
  9. (FLOPS) to computers running a trillion times faster. The scalability can
  10. be used to compare single processors to massively parallel collections
  11. of processors, and to study the space of problem size vs. ensemble size
  12. in fine detail. It resembles the LINPACK benchmark since it involves
  13. factoring and backsolving a (nearly) dense matrix, but incorporates a
  14. number of improvements to that benchmark that we hope will make SLALOM
  15. a better reflection of general system performance.
  16. The SLALOM benchmark solves a complete, real problem (optical radiosity
  17. on the interior of a box), not a contrived kernel or a synthetic mixture of
  18. sample operations. SLALOM is unusual since it times input, problem setup,
  19. solution, and output, not just the solution. For slower computers, the
  20. problem setup will take the majority of the time; it grows as the square of
  21. the problem size. The solver grows as the cube of the problem size, and
  22. dominates the time for large values of n.
  23. While the following is C, you are free to translate it into any
  24. language you like, including assembly language specific to one computer.
  25. You may use compiler directives, hand-tuned library calls, loop unrolling,
  26. and even change the algorithm, if you can provide a convincing argument
  27. that the program still works for the full range of possible inputs. For
  28. example, if you replace the direct solver with an iterative one, you must
  29. make sure your method is correct even when the geometry is quite eccentric
  30. and the box faces are highly reflective. (rho = .999)
  31. The main() driver should be used with the value of 60 seconds for the
  32. SLALOM benchmark. The work done for a particular problem size is figured
  33. after timing has ceased, so there is no overhead for work assessment. The
  34. residual check ||Ax - b|| is also done after timing has ceased. Two
  35. computers may be compared either by their problem size n, or by their MFLOPS
  36. rate, never by the ratio of execution times. Times will always be near one
  37. minute in SLALOM. We have used the following weights for floating-point
  38. operation counting, based on the weights used by Lawrence Livermore National
  39. Laboratory:
  40. OPERATION WEIGHT
  41. a=b, a=(constant) 0
  42. a<0, a<=0, a==0, a!=0, a>0, a>=0 0
  43. -a, fabs(a), fsgn(a, b) 0
  44. a+b, a-b, a*b, a^2 1
  45. a<b, a<=b, a==b, a!=b, a>b, a>=b 1
  46. (int) a, (double)b 1
  47. 1/a, -1/a 3
  48. a/b 4
  49. sqrt(a) 4
  50. Format to or from ASCII string 6
  51. sin(a), cos(a), tan(a), log(a), atan(a), exp(a) 8
  52. We invite you to share with us the results of any measurements that you
  53. make with SLALOM. We do NOT accept anonymous data; machine timings will be
  54. referenced and dated.
  55. The least you need to do to adapt SLALOM to your computer is:
  56. 1. In the "Measure" routine, set NMAX to a value large enough to keep
  57. the computer working for a minute. Vary it slightly if it helps
  58. (for reasons of cache size, interleaving, etc.)
  59. 2. Replace the timer call in "When" with the most accurate wall-clock
  60. timer at your disposal. If only CPU time is available, try to run
  61. the job standalone or at high priority, since we are ultimately
  62. interested in the top of the statistical range of performance.
  63. 3. Edit in the information specific to your test in the "What"
  64. routine, so that final output will be automatically annotated.
  65. 4. Compile, link, and run the program, interacting to select values
  66. of n that bracket a time of one minute. Once everything is
  67. running, run it as a batch job so as to record the session.
  68. Examples of ways you may optimize performance:
  69. 1. Unroll the loops in SetUp1 and SetUp2; it is possible to
  70. vectorize both SetUp1 and SetUp2 at the cost of some extra
  71. operations, program complexity, and storage.
  72. 2. Replace the innermost loops of Solver with calls to well-tuned
  73. libraries of linear algebra routines, such as DDOT from the
  74. Basic Linear Algebra Subroutines (level 1 BLAS). Better still,
  75. use a tuned library routine for all of Solver; the sparsity
  76. exploited in Solver is only a few percent, so you will usually
  77. gain more than you lose by applying a dense symmetric solver.
  78. 3. Parallelize the SetUp and Solver routines; all are highly
  79. parallel. Each element of the matrix can be constructed
  80. independently, once each processor knows the geometry and part of
  81. the partitioning into regions. A substantial body of literature
  82. now exists for performing the types of operations in Solver in
  83. parallel.
  84. 4. Overlap computation with output. Once the Region routine is done,
  85. the first part of the output file (patch geometry) can be written
  86. while the radiosities are being calculated.
  87. Examples of what you may NOT do:
  88. 1. The tuning must not be made specific to the particular input
  89. provided. For example, you may not eliminate IF tests simply
  90. because they always come out the same way for this input; you
  91. may not use precomputed answers or table look-up unless those
  92. answers and tables cover the full range of possible inputs; and
  93. you may not exploit symmetry for even values of the problem size.
  94. 2. You may not disable the self-consistency tests in SetUp3 and
  95. Verify, nor alter their tolerance constants.
  96. 3. You may not change the input or output files to unformatted
  97. binary or other format that would render them difficult to create
  98. or read for humans.
  99. 4. You may not eliminate the reading of the "geom" file by putting
  100. its data directly into the compiled program.
  101. 5. You may not change any of the work assessments in Meter. If you
  102. use more floating-point operations than indicated, you must still
  103. use the assessments provided. If you find a way to use fewer
  104. operations and still get the job done for arbitrary input
  105. parameters, please tell us!
  106. -John Gustafson, Diane Rover, Michael Carter,
  107. and Stephen Elbert
  108. Ames Laboratory, Ames, Iowa 50011
  109. ******************************************************************************/
  110. /*****************************************************************************/
  111. /* The following program finds a value n such that a problem of size n */
  112. /* takes just under "goal" seconds to execute. */
  113. /* */
  114. /* John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert */
  115. /* Ames Laboratory, 3/18/90 */
  116. /* */
  117. /* Calls: Meter Measures execution time for some application. */
  118. /* What Prints work-timing statistics and system information. */
  119. /*****************************************************************************/
  120. #include <stdio.h>
  121. #include <math.h>
  122. #include <sys/time.h>
  123. /* NMAX = Largest npatch for your computer; adjust as needed. */
  124. #define NMAX 2048
  125. #define EPS (0.5e-8)
  126. #define FALSE (1==0)
  127. #define TRUE (!FALSE)
  128. #define MAX(a,b) (((a) > (b)) ? (a) : (b))
  129. /* Global variables and function return types: */
  130. double goal, /* User input, fixed-time benchmark goal, in seconds. */
  131. timing, /* Elapsed time returned by Meter routine, in seconds.*/
  132. work, /* In this case, number of FLOPs performed. */
  133. When(), /* Wall clock in seconds. */
  134. Ddot(); /* Double dot product. */
  135. int mean, /* Avg between upper and lower bounds for bisection */
  136. /* method. */
  137. n, /* The problem size. */
  138. nupper, /* Upper bound on problem size, used in iterating */
  139. /* toward goal. */
  140. Meter(), /* Driver for following benchmark functions. */
  141. Reader (), /* Reads problem description from 'geom' file. */
  142. Region (), /* Subdivides box faces into patches. */
  143. SetUp3 (), /* Set up matrix to solve. */
  144. Storer (), /* Write result to 'answer' file. */
  145. Verify (); /* Verify the radiosity solution from solver. */
  146. void SetUp1 (), /* Set up matrix to solve. */
  147. SetUp2 (), /* Set up matrix to solve. */
  148. Solver (); /* Solve the radiosity matrix. */
  149. main ()
  150. {
  151. int ok; /* Return code temporary storage. */
  152. /* Get desired number of seconds: */
  153. printf ("Enter the number of seconds that is the goal: ");
  154. scanf ("%lg", &goal);
  155. /* Get lower and upper bounds for n from the standard input device: */
  156. do {
  157. printf ("Enter a lower bound for n: ");
  158. scanf ("%d", &n);
  159. if (n <= 0)
  160. exit(0);
  161. ok = Meter (n, &timing, &work);
  162. if (timing >= goal)
  163. printf ("Must take less than %g seconds. Took %g.\n",
  164. goal, timing);
  165. } while (!ok || timing >= goal);
  166. do {
  167. printf ("Enter an upper bound for n: ");
  168. scanf ("%d", &nupper);
  169. if (nupper <= 0)
  170. exit(0);
  171. ok = Meter (nupper, &timing, &work);
  172. if (timing < goal) {
  173. printf ("Must take at least %g seconds. Took %g.\n",
  174. goal, timing);
  175. n = MAX(nupper, n);
  176. }
  177. } while (!ok || timing < goal);
  178. /*
  179. * While the [n, nupper] interval is larger than 1, bisect it and
  180. * pick a half:
  181. */
  182. while (nupper - n > 1) {
  183. mean = (n + nupper) / 2;
  184. ok = Meter (mean, &timing, &work);
  185. if (timing < goal)
  186. n = mean;
  187. else
  188. nupper = mean;
  189. printf ("New interval: [%d,%d]\n", n, nupper);
  190. }
  191. /* Ensure that most recent run was for n, not nupper. */
  192. ok = Meter (n, &timing, &work);
  193. /* Print out final statistics. */
  194. What (n, timing, work);
  195. }
  196. /*****************************************************************************/
  197. /* This routine should be edited to contain information for your system. */
  198. /*****************************************************************************/
  199. What (n, timing, work)
  200. int n;
  201. double timing, work;
  202. {
  203. int i;
  204. static char *info[] = {
  205. "Machine: SUN 4/370GX Processor: SPARC",
  206. "Memory: 32 MB # of procs: 1",
  207. "Cache: 128 KB # used: 1",
  208. "NMAX: 512 Clock: 25 MHz",
  209. "Disk: .3GB SCSI+.7GB SMD Node name: amssun2",
  210. "OS: SUNOS 4.0.3 Timer: Wall, gettimeofday()",
  211. "Language: C Alone: yes",
  212. "Compiler: cc Run by: M. Carter",
  213. "Options: -O Date: 23 May 1990",
  214. NULL
  215. };
  216. printf ("\n");
  217. for (i = 0 ; info[i] ; i++)
  218. puts (info[i]);
  219. printf ("M ops: %-13lg Time: %-.3lf seconds\n",
  220. work * 1e-6, timing);
  221. printf ("n: %-6d MFLOPS: %-.5lg\n",
  222. n, (work / timing) * 1e-6);
  223. printf ("Approximate data memory use: %d bytes.\n",
  224. 8 * n * n + 120 * n + 800);
  225. }
  226. /*****************************************************************************/
  227. /* This routine measures time required on a revised LINPACK-type benchmark, */
  228. /* including input, matrix generation, solution, and output. */
  229. /* */
  230. /* John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert */
  231. /* Ames Laboratory, 3/18/90 */
  232. /* */
  233. /* Calls: Reader Reads the problem description from secondary storage. */
  234. /* Region Partitions box surface into rectangular regions (patches).*/
  235. /* SetUp1 Sets up equations from patch geometries-parallel faces. */
  236. /* SetUp2 Sets up equations from patch geometries-orthogonal faces. */
  237. /* SetUp3 Sets up equations-row normalization and radiant props. */
  238. /* Solver Solves the equations by LDL factorization. */
  239. /* Storer Stores solution (patch radiosities) on secondary storage. */
  240. /* When Returns wall-clock time, in seconds. */
  241. /*****************************************************************************/
  242. Meter (npatch, timing, work)
  243. int npatch; /* In, problem size, here the number of equations. */
  244. double *timing, /* Out, number of seconds measured. */
  245. *work; /* Out, work done, here the number of FLOPs. */
  246. {
  247. static
  248. double area[NMAX], /* Areas of patches * 8 * pi. */
  249. box[7], /* Dimensions of box in x, y, z directions. */
  250. coeff[NMAX][NMAX], /* The coefficients of the eqns to solve. */
  251. diag[3][NMAX], /* Diag terms of the eqns to solve. (RGB) */
  252. emiss[6][3], /* (RGB) emissivities of patches. */
  253. place[3][NMAX], /* Width-height-depth position of patches. */
  254. result[3][NMAX], /* Answer radiosities (RGB). */
  255. rho[6][3], /* (RGB) Reflectivities of patches. */
  256. rhs[3][NMAX], /* Right-hand sides of eqns to solve (RGB). */
  257. size[2][NMAX]; /* Width-height sizes of patches. */
  258. double ops[8], /* Floating-point operation counts. */
  259. p[6], /* Number of patches in faces. */
  260. sec[8], /* Times for routines, in seconds. */
  261. tmp1, tmp2; /* Double temporary variables. */
  262. int i, /* Loop counter. */
  263. itmp1, /* Integer temporary variable. */
  264. non0; /* Index of first nonzero off-diagonal elem. */
  265. static
  266. int loop[6][2]; /* Patch number ranges for faces. */
  267. static char *tasks[] = { /* Names of all the functions in benchmark. */
  268. "Reader", "Region",
  269. "SetUp1", "SetUp2",
  270. "SetUp3", "Solver",
  271. "Storer"
  272. };
  273. static char *format = /* Output line format. */
  274. "%6.6s%8.3f%17.0f%14.6f%10.1f %%\n";
  275. /* First check that npatch lies between 6 and NMAX: */
  276. if (npatch < 6) {
  277. printf ("Must be at least 6, the number of faces.\n");
  278. return (FALSE);
  279. }
  280. else if (npatch > NMAX) {
  281. printf ("Exceeds %d = maximum for this system.\n", NMAX);
  282. return (FALSE);
  283. }
  284. /* Ensure that previous 'answer' file is deleted: */
  285. unlink ("answer");
  286. /* Time the tasks, individually and collectively. */
  287. sec[0] = When();
  288. if (!Reader (box, rho, emiss))
  289. return (FALSE);
  290. sec[1] = When();
  291. if (!Region (npatch, loop, box, place, size, area))
  292. return (FALSE);
  293. sec[2] = When();
  294. SetUp1 (npatch, loop, coeff, place, size);
  295. sec[3] = When();
  296. SetUp2 (npatch, loop, coeff, place, size);
  297. sec[4] = When();
  298. if (!SetUp3 (npatch, loop, area, rho, emiss, coeff, diag, rhs))
  299. return (FALSE);
  300. sec[5] = When();
  301. non0 = loop[1][0];
  302. Solver (npatch, non0, coeff, diag, rhs, result);
  303. sec[6] = When();
  304. Storer (npatch, loop, place, size, result);
  305. sec[7] = When();
  306. *timing = sec[7] - sec[0];
  307. for (i = 0 ; i < 7 ; i++)
  308. sec[i] = sec[i+1] - sec[i];
  309. /* Assess floating-point work done by each routine called, and total: */
  310. /* Note the ops counts are talleyed into a double array, and there */
  311. /* some strange casts to double in some equations. This is to */
  312. /* prevent integer overflow. */
  313. itmp1 = 0;
  314. tmp1 = 0.0;
  315. for (i = 0 ; i < 6 ; i++) {
  316. p[i] = loop[i][1] - loop[i][0] + 1;
  317. tmp1 += p[i] * p[i];
  318. itmp1 += sqrt(p[i] * box[i] / box[i + 1]) + 0.5;
  319. }
  320. tmp2 = p[0] * p[3] + p[1] * p[4] + p[2] * p[5];
  321. ops[0] = 258;
  322. ops[1] = 154 + (double) 8 * itmp1 + npatch;
  323. ops[2] = 6 + 532 * tmp2;
  324. ops[3] = 8*npatch + 370 * ((double) npatch * npatch - tmp1 - 2*tmp2) / 2.0;
  325. ops[4] = 72 + (double) 9 * npatch + (double) npatch * npatch - tmp1;
  326. ops[5] = npatch * (npatch * ((double) npatch + 7.5) - 2.5) - 21
  327. + (non0+1) * ((non0+1) * (2 * ((double) non0+1) - 16.5) + 35.5)
  328. + (non0+1) * npatch * (9 - 3 * ((double) non0+1));
  329. ops[6] = 48 * npatch;
  330. *work = ops[0] + ops[1] + ops[2] + ops[3] + ops[4] + ops[5] + ops[6];
  331. /* Display timing-work-speed breakdown by routine. */
  332. printf ("%d patches:\n", npatch);
  333. printf (" Task Seconds Operations MFLOPS %% of Time\n");
  334. for (i = 0 ; i < 7 ; i++) {
  335. if (sec[i] == 0.0)
  336. sec[i] = 0.001;
  337. printf (format, tasks[i], sec[i], ops[i], (ops[i] / sec[i]) * 1e-6,
  338. 100.0 * sec[i] / *timing);
  339. }
  340. printf (format, "TOTALS", *timing, *work, (*work / *timing) * 1e-6, 100.0);
  341. Verify (npatch, coeff, diag, rhs, result);
  342. return (TRUE);
  343. }
  344. /*****************************************************************************/
  345. /* This function should return the actual, wall clock time (not CPU time) */
  346. /* in seconds as accurately as possible. Change it to your system timer. */
  347. /*****************************************************************************/
  348. double
  349. When()
  350. {
  351. struct timeval tp;
  352. struct timezone tzp;
  353. gettimeofday (&tp, &tzp);
  354. return ((double) tp.tv_sec + (double) tp.tv_usec * 1e-6);
  355. }
  356. /*****************************************************************************/
  357. /* The following routine reads in the problem description from secondary */
  358. /* storage, and checks that numbers are in reasonable ranges. */
  359. /*****************************************************************************/
  360. Reader (box, rho, emiss)
  361. double box[], /* Out: Dimensions of box in x, y, z directions. */
  362. rho[][3], /* Out: (RGB) Reflectivities of patches. */
  363. emiss[][3]; /* Out: (RGB) emissivities of patches. */
  364. {
  365. /*
  366. * Local variables:
  367. * infile Device number for input file.
  368. * i, j Loop counters.
  369. * tmp1 Maximum emissivity, to check that emissivities are not all 0.
  370. */
  371. int i, j, /* Loop counters. */
  372. n; /* Number of args fscanf()'ed from file. */
  373. double tmp1; /* Maximum emissivity. */
  374. FILE *infile; /* Input file pointer. */
  375. char buff[81]; /* Buffer used to eat a line of input. */
  376. /* Open the input file and read in the data. */
  377. if ((infile = fopen ("geom", "r")) == NULL) {
  378. printf ("slalom: 'geom' geometry file not found.\n");
  379. exit (1);
  380. }
  381. /* Read the box coordinates and error check. */
  382. n = 0;
  383. for (i = 0 ; i < 3 ; i++) {
  384. n += fscanf (infile, "%lg", &box[i]);
  385. }
  386. fgets (buff, 80, infile); /* Eat the rest of the line. */
  387. if (n != 3) {
  388. printf ("Must specify exactly 3 box coordinates.\n");
  389. exit(1);
  390. }
  391. /* Read the reflectivities and error check. */
  392. n = 0;
  393. for (j = 0 ; j < 3 ; j++) {
  394. for (i = 0 ; i < 6 ; i++) {
  395. n += fscanf (infile, "%lg", &rho[i][j]);
  396. }
  397. }
  398. fgets (buff, 80, infile); /* Eat the rest of the line. */
  399. if (n != 18) {
  400. printf ("Must specify exactly 18 box coordinates.\n");
  401. exit(1);
  402. }
  403. /* Read the emissivities and error check. */
  404. n = 0;
  405. for (j = 0 ; j < 3 ; j++) {
  406. for (i = 0 ; i < 6 ; i++) {
  407. n += fscanf (infile, "%lg", &emiss[i][j]);
  408. }
  409. }
  410. fgets (buff, 80, infile); /* Eat the rest of the line. */
  411. if (n != 18) {
  412. printf ("Must specify exactly 18 box coordinates.\n");
  413. exit(1);
  414. }
  415. fclose (infile);
  416. /* Now sanity-check the values that were just read. */
  417. for (j = 0 ; j < 3 ; j++) {
  418. if (box[j] < 1.0 || box[j] >= 100.0) {
  419. printf ("Box dimensions must be between 1 and 100.\n");
  420. return (FALSE);
  421. }
  422. box[j+3] = box[j];
  423. tmp1 = 0.0;
  424. for (i = 0 ; i < 6 ; i++) {
  425. if (rho[i][j] < 0.000 || rho[i][j] > 0.999) {
  426. printf ("Reflectivities must be between .000 and .999.\n");
  427. return (FALSE);
  428. }
  429. if (emiss[i][j] < 0.0) {
  430. printf ("Emissivity cannot be negative.\n");
  431. return (FALSE);
  432. }
  433. if (tmp1 < emiss[i][j])
  434. tmp1 = emiss[i][j];
  435. }
  436. if (tmp1 == 0.0) {
  437. printf ("Emissivities are zero. Problem is trivial.\n");
  438. return (FALSE);
  439. }
  440. }
  441. box[6] = box[3];
  442. return (TRUE);
  443. }
  444. /*****************************************************************************/
  445. /* The following routine decomposes the surface of a variable-sized box */
  446. /* into patches that are as nearly equal in size and square as possible. */
  447. /*****************************************************************************/
  448. Region (npatch, loop, box, place, size, area)
  449. int npatch, /* In: Problem size. */
  450. loop[][2]; /* Out: Patch number ranges for faces. */
  451. double area[], /* Out: 8pi * areas of the patches. */
  452. box[], /* In: Dimensions of box in x, y, z directions. */
  453. place[][NMAX], /* Out: Width-height-depth positions of patches. */
  454. size[][NMAX]; /* Out: Width-height sizes of patches. */
  455. {
  456. int icol, /* Loop counter over the number of columns. */
  457. ipatch, /* Loop counter over the number of patches. */
  458. iface, /* Loop counter over the number of faces. */
  459. itmp1, /* Integer temporary variables. */
  460. itmp2, /* Integer temporary variables. */
  461. last, /* Inner loop ending value. */
  462. lead, /* Inner loop starting value. */
  463. numcol, /* Number of columns on faces. */
  464. numpat, /* Number of patches on a face. */
  465. numrow; /* Number of rows of patches in a column. */
  466. double height, /* Height of a patch within a column. */
  467. tmp1, /* double temporary variables. */
  468. tmp2, /* double temporary variables. */
  469. tmp3, /* double temporary variables. */
  470. tmp4, /* double temporary variables. */
  471. width; /* Width of a column of patches. */
  472. /* Allocate patches to each face, proportionate to area of each face. */
  473. tmp1 = 2.0 * (box[0] * box[1] + box[1] * box[2] + box[2] * box[0]);
  474. tmp2 = 0.0;
  475. tmp3 = npatch;
  476. loop[0][0] = 0;
  477. for (iface = 0 ; iface < 5 ; iface++) {
  478. tmp2 = tmp2 + box[iface] * box[iface + 1];
  479. loop[iface][1] = (int) (tmp3 * tmp2 / tmp1 + 0.5) - 1;
  480. loop[iface + 1][0] = loop[iface][1] + 1;
  481. }
  482. loop[5][1] = npatch - 1;
  483. /* Subdivide each face into numpat patches. */
  484. for (iface = 0 ; iface < 6 ; iface++) {
  485. numpat = loop[iface][1] - loop[iface][0] + 1;
  486. tmp3 = 0.0;
  487. if (iface >= 3)
  488. tmp3 = box[iface-1];
  489. numcol = (int) (sqrt(numpat * box[iface] / box[iface + 1]) + 0.5);
  490. if (numcol > numpat)
  491. numcol = numpat;
  492. if (numcol == 0)
  493. numcol = 1;
  494. width = box[iface] / numcol;
  495. itmp1 = numcol - 1;
  496. tmp1 = 0.0;
  497. for (icol = 0 ; icol < numcol ; icol++) {
  498. itmp2 = itmp1 / numcol;
  499. numrow = (itmp1 + numpat) / numcol - itmp2;
  500. if (numrow == 0) {
  501. printf ("Eccentric box requires more patches.\n");
  502. return (FALSE);
  503. }
  504. height = box[iface + 1] / numrow;
  505. tmp2 = 0.0;
  506. tmp4 = width * height * (8.0 * M_PI);
  507. lead = loop[iface][0] + itmp2;
  508. last = lead + numrow;
  509. for (ipatch = lead ; ipatch < last ; ipatch++) {
  510. size[0][ipatch] = width;
  511. size[1][ipatch] = height;
  512. place[0][ipatch] = tmp1;
  513. place[1][ipatch] = tmp2;
  514. place[2][ipatch] = tmp3;
  515. area[ipatch] = tmp4;
  516. tmp2 = tmp2 + height;
  517. }
  518. tmp1 = tmp1 + width;
  519. itmp1 = itmp1 + numpat;
  520. }
  521. }
  522. return (TRUE);
  523. }
  524. /*****************************************************************************/
  525. /* This routine sets up the radiosity matrix for parallel patches. */
  526. /*****************************************************************************/
  527. void
  528. SetUp1 (npatch, loop, coeff, place, size)
  529. int npatch, /* In: Problem size. */
  530. loop[][2]; /* In: Patch number ranges for faces. */
  531. double coeff[][NMAX], /* Out: The coefficients of the eqns to solve. */
  532. place[][NMAX], /* In: Width-height-depth positions of patches. */
  533. size[][NMAX]; /* In: Width-height sizes of patches. */
  534. {
  535. int i, j, k, /* General loop counters. */
  536. m, n, /* General loop counters. */
  537. iface, /* Loop counter over the number of faces. */
  538. ipatch, /* Loop counter over the number of patches. */
  539. jface, /* Face coupled to iface when computing mat. elems. */
  540. jpatch; /* Patch coupled to ipatch when computing mat. elems.*/
  541. double d[2][2][2], /* Point-to-point couplings between patch corners. */
  542. d2[2][2][2],/* Squares of d values, to save recomputation. */
  543. tmp1, tmp2, /* Double temporary variables. */
  544. tmp3, tmp4, /* Double temporary variables. */
  545. tmp5, tmp6, /* Double temporary variables. */
  546. tmp7, tmp8; /* Double temporary variables. */
  547. for (iface = 0 ; iface < 3 ; iface++) {
  548. jface = iface + 3;
  549. tmp1 = place[2][loop[jface][0]] * place[2][loop[jface][0]];
  550. tmp6 = tmp1 + tmp1;
  551. for (ipatch = loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
  552. for (jpatch=loop[jface][0] ; jpatch <= loop[jface][1] ; jpatch++) {
  553. for (j = 0 ; j < 2 ; j++) {
  554. d [0][0][j] = place[j][jpatch] - place[j][ipatch];
  555. d [1][0][j] = d[0][0][j] + size[j][jpatch];
  556. d [0][1][j] = d[0][0][j] - size[j][ipatch];
  557. d [1][1][j] = d[1][0][j] - size[j][ipatch];
  558. d2[0][0][j] = d[0][0][j] * d[0][0][j];
  559. d2[1][0][j] = d[1][0][j] * d[1][0][j];
  560. d2[0][1][j] = d[0][1][j] * d[0][1][j];
  561. d2[1][1][j] = d[1][1][j] * d[1][1][j];
  562. }
  563. tmp2 = 0.0;
  564. for (m = 0 ; m < 2 ; m++) {
  565. for (i = 0 ; i < 2 ; i++) {
  566. tmp3 = d2[m][i][1] + tmp1;
  567. tmp4 = sqrt(tmp3);
  568. tmp5 = 1.0 / tmp4;
  569. tmp8 = 0.0;
  570. for (k = 0 ; k < 2 ; k++) {
  571. for (n = 0 ; n < 2 ; n++) {
  572. tmp7 = d[k][n][0];
  573. tmp8 = -tmp7 * atan(tmp7 * tmp5) - tmp8;
  574. }
  575. tmp8 = -tmp8;
  576. }
  577. tmp2 = -4.0 * tmp4 * tmp8 - tmp2 - tmp6 *
  578. log(((d2[1][0][0] + tmp3) * (d2[0][1][0] + tmp3)) /
  579. ((d2[0][0][0] + tmp3) * (d2[1][1][0] + tmp3)));
  580. }
  581. tmp2 = -tmp2;
  582. }
  583. for (m = 0 ; m < 2 ; m++) {
  584. for (i = 0 ; i < 2 ; i++) {
  585. tmp4 = sqrt(d2[m][i][0] + tmp1);
  586. tmp5 = 1.0 / tmp4;
  587. tmp8 = 0.0;
  588. for (k = 0 ; k < 2 ; k++) {
  589. for (n = 0 ; n < 2 ; n++) {
  590. tmp7 = d[k][n][1];
  591. tmp8 = -tmp7 * atan(tmp7 * tmp5) - tmp8;
  592. }
  593. tmp8 = -tmp8;
  594. }
  595. tmp2 = -4.0 * tmp4 * tmp8 - tmp2;
  596. }
  597. tmp2 = -tmp2;
  598. }
  599. coeff[ipatch][jpatch] = tmp2;
  600. coeff[jpatch][ipatch] = tmp2;
  601. }
  602. }
  603. }
  604. }
  605. /*****************************************************************************/
  606. /* This routine sets up the radiosity matrix for orthogonal patches. */
  607. /*****************************************************************************/
  608. void
  609. SetUp2 (npatch, loop, coeff, place, size)
  610. int npatch, /* In: Problem size. */
  611. loop[][2]; /* In: Patch number ranges for faces. */
  612. double coeff[][NMAX], /* Out: The coefficients of the eqns to solve. */
  613. place[][NMAX], /* In: Width-height-depth positions of patches. */
  614. size[][NMAX]; /* In: Width-height sizes of patches. */
  615. {
  616. int m, /* General loop counters. */
  617. iface, /* Loop counter over the number of faces. */
  618. ipatch, /* Loop counter over the number of patches. */
  619. jface, /* Face coupled to iface when computing mat. elems. */
  620. jpatch; /* Patch coupled to ipatch when computing mat. elems.*/
  621. double tmpb, tmpa,
  622. c11d, c12d, c21d, c22d, c11s, c12s, c21s, c22s,
  623. d11d, d12d, d21d, d22d, d11s, d12s, d21s, d22s,
  624. d11i, d12i, d21i, d22i, a10s, a20s, b01s, b02s,
  625. e1111, e1211, e2111, e2211, e1112, e1212, e2112, e2212,
  626. e1121, e1221, e2121, e2221, e1122, e1222, e2122, e2222;
  627. for (iface = 0 ; iface < 6 ; iface++) {
  628. for (m = 0 ; m < 2 ; m++) {
  629. jface = (iface + m + 1) % 6;
  630. for (ipatch=loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
  631. a10s = place[m][ipatch] - place[2][loop[jface][0]];
  632. a20s = a10s + size[m][ipatch];
  633. a10s = a10s * a10s;
  634. a20s = a20s * a20s;
  635. for (jpatch=loop[jface][0] ; jpatch<=loop[jface][1];jpatch++) {
  636. c11d = place[m][jpatch] - place[1-m][ipatch];
  637. c12d = c11d + size[m][jpatch];
  638. c21d = c11d - size[1-m][ipatch];
  639. c22d = c12d - size[1-m][ipatch];
  640. c11s = c11d * c11d;
  641. c12s = c12d * c12d;
  642. c21s = c21d * c21d;
  643. c22s = c22d * c22d;
  644. b01s = place[1 - m][jpatch] - place[2][ipatch];
  645. b02s = b01s + size[1 - m][jpatch];
  646. /**/
  647. /* Bump the term by a small real to avoid
  648. /* singularities in coupling function:
  649. /**/
  650. b01s = b01s * b01s + 1e-35;
  651. b02s = b02s * b02s + 1e-35;
  652. d11s = a10s + b01s;
  653. d12s = a10s + b02s;
  654. d21s = a20s + b01s;
  655. d22s = a20s + b02s;
  656. d11d = sqrt(d11s);
  657. d12d = sqrt(d12s);
  658. d21d = sqrt(d21s);
  659. d22d = sqrt(d22s);
  660. d11i = 1.0 / d11d;
  661. d12i = 1.0 / d12d;
  662. d21i = 1.0 / d21d;
  663. d22i = 1.0 / d22d;
  664. tmpa = d11d * ( c11d * atan (c11d * d11i)
  665. - c12d * atan (c12d * d11i)
  666. - c21d * atan (c21d * d11i)
  667. + c22d * atan (c22d * d11i))
  668. + d12d * (-c11d * atan (c11d * d12i)
  669. + c12d * atan (c12d * d12i)
  670. + c21d * atan (c21d * d12i)
  671. - c22d * atan (c22d * d12i))
  672. + d21d * (-c11d * atan (c11d * d21i)
  673. + c12d * atan (c12d * d21i)
  674. + c21d * atan (c21d * d21i)
  675. - c22d * atan (c22d * d21i))
  676. + d22d * ( c11d * atan (c11d * d22i)
  677. - c12d * atan (c12d * d22i)
  678. - c21d * atan (c21d * d22i)
  679. + c22d * atan (c22d * d22i));
  680. e1111 = c11s + d11s;
  681. e1211 = c12s + d11s;
  682. e2111 = c21s + d11s;
  683. e2211 = c22s + d11s;
  684. e1112 = c11s + d12s;
  685. e1212 = c12s + d12s;
  686. e2112 = c21s + d12s;
  687. e2212 = c22s + d12s;
  688. e1121 = c11s + d21s;
  689. e1221 = c12s + d21s;
  690. e2121 = c21s + d21s;
  691. e2221 = c22s + d21s;
  692. e1122 = c11s + d22s;
  693. e1222 = c12s + d22s;
  694. e2122 = c21s + d22s;
  695. e2222 = c22s + d22s;
  696. tmpb = c11s * log( e1111 * e1122 / (e1112 * e1121))
  697. - c12s * log( e1211 * e1222 / (e1212 * e1221))
  698. - c21s * log( e2111 * e2122 / (e2112 * e2121))
  699. + c22s * log( e2211 * e2222 / (e2212 * e2221))
  700. - d11s * log( e1111 * e2211 / (e1211 * e2111))
  701. + d12s * log( e1112 * e2212 / (e1212 * e2112))
  702. + d21s * log( e1121 * e2221 / (e1221 * e2121))
  703. - d22s * log( e1122 * e2222 / (e1222 * e2122));
  704. coeff[ipatch][jpatch] = fabs(4.0 * tmpa + tmpb);
  705. coeff[jpatch][ipatch] = coeff[ipatch][jpatch];
  706. }
  707. }
  708. }
  709. }
  710. }
  711. /*****************************************************************************/
  712. /* This routine sets up the radiosity matrix... normalizes row sums to 1, */
  713. /* and includes terms derived from reflectivites and emissivities of faces. */
  714. /*****************************************************************************/
  715. SetUp3 (npatch, loop, area, rho, emiss, coeff, diag, rhs)
  716. int npatch, /* In: Problem size. */
  717. loop[][2]; /* In: Patch number ranges for faces. */
  718. double area[], /* In: 8 * pi * areas of the patches. */
  719. rho[][3], /* In: (RGB) Reflectivities of the face interiors. */
  720. emiss[][3], /* In: (RGB) Emissivities of the face interiors. */
  721. coeff[][NMAX], /* Out: The coefficients of the eqns to solve. */
  722. diag[][NMAX], /* Out: (RGB) Diagonal terms of the system. */
  723. rhs[][NMAX]; /* Out: (RGB) Right-hand sides of system to solve. */
  724. {
  725. /*
  726. * Local variables:
  727. * iface Loop counter over the number of faces.
  728. * ipatch Outer loop counter over the number of patches.
  729. * j Loop counter over each color (R-G-B).
  730. * jpatch Inner loop counter over the number of patches.
  731. * tmp1 double temporary variable.
  732. * vtmp1-2 double vector temporary variables.
  733. */
  734. int j, /* (RGB) Loop counter over each color. */
  735. iface, /* Loop counter over the number of faces. */
  736. ipatch, /* Outer loop counter over the number of patches. */
  737. jpatch; /* Inner loop counter over the number of patches. */
  738. double tmp1, /* Double temporary variable. */
  739. vtmp1[3], /* Double vector temporary variables. */
  740. vtmp2[3]; /* Double vector temporary variables. */
  741. /* Ensure that row sums to 1, and put in reflectivities (rho) and */
  742. /* emissivities. */
  743. for (iface = 0 ; iface < 6 ; iface++) {
  744. for (j = 0 ; j < 3 ; j++) {
  745. vtmp1[j] = 1.0 / rho[iface][j];
  746. vtmp2[j] = emiss[iface][j] * vtmp1[j];
  747. }
  748. for (ipatch = loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
  749. tmp1 = 0.0;
  750. for (jpatch = 0 ; jpatch < loop[iface][0] ; jpatch++) {
  751. tmp1 += coeff[ipatch][jpatch];
  752. }
  753. for (jpatch = loop[iface][1]+1 ; jpatch < npatch ; jpatch++) {
  754. tmp1 += coeff[ipatch][jpatch];
  755. }
  756. /* Make sure row sum (total form factor) is close to 1: */
  757. if (fabs(tmp1 - area[ipatch]) > (0.5e-9 * tmp1)) {
  758. printf ("Total form factor is too far from unity.\n");
  759. return (FALSE);
  760. }
  761. tmp1 = -tmp1;
  762. /* Set coplanar patch interactions to zero. */
  763. for (jpatch=loop[iface][0] ; jpatch <= loop[iface][1] ; jpatch++) {
  764. coeff[ipatch][jpatch] = 0.0;
  765. }
  766. /* Assign diagonal entries and right-hand sides. */
  767. for (j = 0 ; j < 3 ; j++) {
  768. diag[j][ipatch] = vtmp1[j] * tmp1;
  769. rhs[j][ipatch] = vtmp2[j] * tmp1;
  770. }
  771. }
  772. }
  773. return (TRUE);
  774. }
  775. /*****************************************************************************/
  776. /* This routine factors and backsolves a real, symmetric, near-dense matrix */
  777. /* by LDL factorization. No pivoting; the matrix is diagonally dominant. */
  778. /*****************************************************************************/
  779. void
  780. Solver (npatch, non0, coeff, diag, rhs, result)
  781. int npatch, /* In: Problem size. */
  782. non0; /* In: Index of first nonzero off-diagonal mat. elem.*/
  783. double coeff[][NMAX], /* In/Out: The coefficients of the eqns to solve. */
  784. diag[][NMAX], /* Out: (RGB) Diagonal terms of the system. */
  785. rhs[][NMAX], /* In: (RGB) Right-hand sides of system to solve. */
  786. result[][NMAX]; /* Out: (RGB) solution radiosities. */
  787. {
  788. int i, j, /* General loop counters. */
  789. k, m; /* General loop counters. */
  790. double tmp1; /* Double temporary variable. */
  791. /* Load lower triangle of coefficients, diagonal, and solution vector. */
  792. for (m = 0 ; m < 3 ; m++) {
  793. for (i = non0 ; i < npatch ; i++) {
  794. coeff[i][i] = diag[m][i];
  795. result[m][i] = rhs[m][i];
  796. for (j = 0 ; j < i ; j++) {
  797. coeff[i][j] = coeff[j][i];
  798. }
  799. }
  800. /* Factor matrix, writing factors on top of original matrix. */
  801. for (j = 0 ; j < non0 ; j++) {
  802. coeff[j][j] = 1.0 / diag[m][j];
  803. result[m][j] = rhs[m][j];
  804. }
  805. for (j = non0 ; j < npatch ; j++) {
  806. for (k = non0 ; k < j ; k++) {
  807. coeff[j][k] -= Ddot (k, &coeff[k][0], 1, &coeff[j][0], 1);
  808. }
  809. for (k = 0 ; k < j ; k++) {
  810. tmp1 = coeff[j][k];
  811. coeff[j][k] = tmp1 * coeff[k][k];
  812. coeff[j][j] -= tmp1 * coeff[j][k];
  813. }
  814. coeff[j][j] = 1.0 / coeff[j][j];
  815. }
  816. /* Backsolve, in three stages (for L, D, and L transpose). */
  817. for (k = non0 ; k < npatch ; k++) {
  818. result[m][k] -= Ddot (k, &result[m][0], 1, &coeff[k][0], 1);
  819. }
  820. for (k = 0 ; k < npatch ; k++) {
  821. result[m][k] *= coeff[k][k];
  822. }
  823. for (k = npatch - 2 ; k >= non0 ; k--) {
  824. result[m][k] -= Ddot (npatch-(k+1), &result[m][k+1], 1,
  825. &coeff[k+1][k], NMAX);
  826. }
  827. for (k = non0 - 1 ; k >= 0 ; k--) {
  828. result[m][k] -= Ddot (npatch-non0, &result[m][non0], 1,
  829. &coeff[non0][k], NMAX);
  830. }
  831. }
  832. }
  833. /*****************************************************************************/
  834. /* The following routine writes the answer to secondary storage. */
  835. /*****************************************************************************/
  836. Storer (npatch, loop, place, size, result)
  837. int npatch, /* In: Problem size. */
  838. loop[][2]; /* In: Patch number ranges for faces. */
  839. double result[][NMAX], /* In: (RGB) Radiosity solutions. */
  840. place[][NMAX], /* In: Width-height-depth positions of patches. */
  841. size[][NMAX]; /* In: Width-height sizes of patches. */
  842. {
  843. int i, /* General loop counter. */
  844. iface, /* Loop counter over number of faces. */
  845. ipatch; /* Loop counter of number of patches within a face. */
  846. FILE *outfile; /* Output file pointer. */
  847. /* Write patch geometry to 'answer' file. */
  848. if ((outfile = fopen("answer", "w")) == NULL) {
  849. printf ("Unable to open 'answer' file.\n");
  850. exit (1);
  851. }
  852. fprintf (outfile, "%d patches:\n", npatch);
  853. fprintf (outfile,
  854. " Patch Face Position in w, h, d Width Height\n");
  855. for (iface = 0 ; iface < 6 ; iface++) {
  856. for (ipatch = loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
  857. fprintf (outfile,
  858. "%5d %4d%11.5lf%11.5lf%11.5lf %11.5lf%11.5lf\n",
  859. ipatch+1, iface+1,
  860. place[0][ipatch],
  861. place[1][ipatch],
  862. place[2][ipatch],
  863. size[0][ipatch],
  864. size[1][ipatch]);
  865. }
  866. }
  867. /* Write patch radiosities to 'answer' file. */
  868. fprintf (outfile, "\n Patch Face Radiosities\n");
  869. for (iface = 0 ; iface < 6 ; iface++) {
  870. for (ipatch = loop[iface][0] ; ipatch <= loop[iface][1] ; ipatch++) {
  871. fprintf (outfile, "%5d %4d%12.8lf%12.8lf%12.8lf\n",
  872. ipatch+1, iface+1,
  873. result[0][ipatch],
  874. result[1][ipatch],
  875. result[2][ipatch]);
  876. }
  877. }
  878. fclose(outfile);
  879. }
  880. /*****************************************************************************/
  881. /* This routine verifies that the computed radiosities satisfy the equations.*/
  882. /* */
  883. /* John Gustafson, Diane Rover, Michael Carter, and Stephen Elbert */
  884. /* Ames Laboratory, 3/18/90 */
  885. /*****************************************************************************/
  886. Verify (npatch, coeff, diag, rhs, result)
  887. int npatch; /* In: Problem size. */
  888. double coeff[][NMAX], /* In: The coefficients of the eqns to solve. */
  889. diag[][NMAX], /* In: (RGB) Diagonal terms of the system. */
  890. rhs[][NMAX], /* In: (RGB) Right-hand sides of system to solve. */
  891. result[][NMAX]; /* In: (RGB) Radiosity solutions. */
  892. {
  893. double tmp1, tmp2; /* Double temporary variables. */
  894. double anorm, /* Norm accumulation variable. */
  895. xnorm; /* Norm accumulation variable. */
  896. int i, j, m; /* General loop counters. */
  897. tmp1 = 0.0;
  898. for (m = 0 ; m < 3 ; m++) {
  899. /* Copy lower triangle of coefficients to upper triangle, */
  900. /* and load diagonal. */
  901. for (i = 0 ; i < npatch ; i++) {
  902. coeff[i][i] = diag[m][i];
  903. for (j = 0 ; j < i ; j++) {
  904. coeff[i][j] = coeff[j][i];
  905. }
  906. }
  907. /* Multiply matrix by solution vector, and accum. norm of residual. */
  908. anorm = xnorm = 0.0;
  909. for (j = 0 ; j < npatch ; j++) {
  910. tmp2 = rhs[m][j];
  911. for (i = 0 ; i < npatch ; i++) {
  912. tmp2 -= (coeff[j][i] * result[m][i]);
  913. anorm = MAX(anorm, fabs(coeff[j][i]));
  914. }
  915. xnorm = MAX(xnorm, fabs(result[m][j]));
  916. tmp1 += fabs(tmp2);
  917. }
  918. }
  919. /* printf ("anorm = %g xnorm = %g\n", anorm, xnorm); */
  920. tmp1 /= (anorm * xnorm);
  921. if (tmp1 > 3 * EPS) {
  922. printf ("Residual is too large: %lg\n", tmp1);
  923. return (FALSE);
  924. }
  925. return (TRUE);
  926. }
  927. #ifdef SUN4
  928. /*****************************************************************************/
  929. /* Double precision dot product specifically written for Sun 4/370. */
  930. /* By Michael Carter and John Gustafson, May 30, 1990 */
  931. /* This code unrolls the dot product four ways since that's how many */
  932. /* registers are available on the SPARC. Other RISC system will require */
  933. /* something very similar. Also, unit stride is take advantage of in the */
  934. /* form of special cases. */
  935. /*****************************************************************************/
  936. double
  937. Ddot (n, a, ia, b, ib)
  938. register
  939. int n, /* Number of elements in vectors. */
  940. ia, /* Stride of a vector in ELEMENTS. */
  941. ib; /* Stride of b vector in ELEMENTS. */
  942. register
  943. double *a, /* Pointer to first vector. */
  944. *b; /* Pointer to second vector. */
  945. {
  946. register double sum0 = 0.0,
  947. sum1 = 0.0,
  948. sum2 = 0.0,
  949. sum3 = 0.0;
  950. register int m = n & 3;
  951. int t;
  952. /* The ragged cleanup part. */
  953. while (m--) {
  954. sum0 += *a * *b;
  955. a += ia;
  956. b += ib;
  957. }
  958. /* The fast pipelined part */
  959. n >>= 2;
  960. if (ib == 1 && ia != 1) {
  961. t = ia;
  962. ia = ib;
  963. ib = t;
  964. t = (int) a;
  965. b = a;
  966. a = (double *) t;
  967. }
  968. /* We can optimize if one or more strides are equal to 1. */
  969. if (ia == 1) {
  970. /* This runs if both strides are 1. */
  971. if (ib == 1) {
  972. ia <<= 2;
  973. ib <<= 2;
  974. while (n--) {
  975. sum0 += a[0] * b[0];
  976. sum1 += a[1] * b[1];
  977. sum2 += a[2] * b[2];
  978. sum3 += a[3] * b[3];
  979. a += ia;
  980. b += ib;
  981. }
  982. }
  983. /* This runs if stride of a only is equal to 1. */
  984. else {
  985. ia <<= 2;
  986. while (n--) {
  987. sum0 += a[0] * *b;
  988. b += ib;
  989. sum1 += a[1] * *b;
  990. b += ib;
  991. sum2 += a[2] * *b;
  992. b += ib;
  993. sum3 += a[3] * *b;
  994. a += ia;
  995. b += ib;
  996. }
  997. }
  998. }
  999. /* This runs for the more general case. */
  1000. /* This is about .5 MFLOPS slower on Sun 4/370 */
  1001. else {
  1002. while (n--) {
  1003. sum0 += *a * *b;
  1004. a += ia;
  1005. b += ib;
  1006. sum1 += *a * *b;
  1007. a += ia;
  1008. b += ib;
  1009. sum2 += *a * *b;
  1010. a += ia;
  1011. b += ib;
  1012. sum3 += *a * *b;
  1013. a += ia;
  1014. b += ib;
  1015. }
  1016. }
  1017. return (sum0 + sum1 + sum2 + sum3);
  1018. }
  1019. #else
  1020. /*****************************************************************************/
  1021. /* Generic double-precision dot product. Unrolling will help pipelined */
  1022. /* computers. Modify accordingly. */
  1023. /*****************************************************************************/
  1024. double
  1025. Ddot (n, a, ia, b, ib)
  1026. register
  1027. int n, /* Number of elements in vectors. */
  1028. ia, /* Stride of a vector in ELEMENTS. */
  1029. ib; /* Stride of b vector in ELEMENTS. */
  1030. register
  1031. double *a, /* Pointer to first vector. */
  1032. *b; /* Pointer to second vector. */
  1033. {
  1034. register double sum = 0.0;
  1035. while (n--) {
  1036. sum += *a * *b;
  1037. a += ia;
  1038. b += ib;
  1039. }
  1040. return (sum);
  1041. }
  1042. #endif